[Answered] How to create a new database collection after each scraping execution?

1👍

MongoDB creates databases and collections lazily, the first time you write to them. So by changing the referenced db and/or collection at the end of each scraping run, you can write to a new db/collection.
To create a new collection for every subject, you could change the database reference in your scrap method to something like:

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")

# use a variable collection name; MongoDB creates the collection on first insert
collection_name = subject
collection = client["db2"][collection_name]

data = df.to_dict(orient='records')
collection.insert_many(data)
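
Since the subject comes from user input, you may also want to normalize it before using it as a collection name: MongoDB collection names cannot be empty and cannot contain the null character or $. A minimal sketch, reusing the client from the snippet above; the safe_collection_name helper is hypothetical:

import re

# hypothetical helper: keep only safe characters so any user-typed
# subject is a valid MongoDB collection name
def safe_collection_name(subject):
    name = re.sub(r'[^A-Za-z0-9_]', '_', subject.strip())
    return name or 'unnamed'

collection = client["db2"][safe_collection_name(subject)]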

To use the variable reference in the datatable_view, you have to read the subject from the POST request. The following snippet might break your request, but it lets you read the crawled subject dynamically:

import pymongo
from django.shortcuts import render

def datatable_view(request):
    if request.method == 'POST':
        form = Scraping(request.POST)  # Scraping form from your own forms module
        if form.is_valid():
            subject = form.cleaned_data['subject']
            scrap(subject)

            client = pymongo.MongoClient("mongodb://localhost:27017/")
            # use the subject from the POST request as the collection name
            db = client["db2"]
            col = db[subject]
            products = col.find()
            context = {'products': products}
            return render(request, 'datatable.html', context)
    # a Django view must always return a response; fall back to an empty table
    return render(request, 'datatable.html', {'products': []})
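
To confirm that a new collection shows up after each run, you can list the collections in db2 (list_collection_names is available in pymongo 3.7+):

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")
# prints every collection currently stored in db2
print(client["db2"].list_collection_names())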

0👍

You are always saving the data in the collection called aliex2, which is why everything ends up in the same collection. If you want to store each scrape in a new collection, make sure you use a unique collection name instead of aliex2 every time you scrape new data.
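
As a sketch of one way to do that (the timestamped naming scheme is just an illustration, and df is the DataFrame from your scrap function):

from datetime import datetime
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")

# e.g. "aliex2_20240101_120000" -- a fresh collection for every scraping run
run_name = "aliex2_" + datetime.now().strftime("%Y%m%d_%H%M%S")
client["db2"][run_name].insert_many(df.to_dict(orient="records"))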

0👍

Looking at your problem, I got an idea that may help you:

from selenium.webdriver.edge.options import Options
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect  # must be installed for lxml's .cssselect() calls below
import time


def scrap(subject):
    start_time = time.time()
    options = Options()
    options.headless = True
    driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",options=options)
    url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText='+subject+'&ltype=wholesale&SortType=default&page={}'
    # baseurl = 'https://www.aliexpress.com'

    results = []  # rows from every page are collected here and written once at the end
    for page_nb in range(1, 5):
        print('---', page_nb, '---')    
        driver.get(url.format(page_nb))
        sleep(2)
        current_offset = 0
        while True:
            # scroll one viewport at a time until the page height stops growing
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # give the page's JavaScript time to load more items
            new_offset = driver.execute_script("return window.pageYOffset;")
            if new_offset <= current_offset:
                break
            current_offset = new_offset
        sleep(3)
        tree = html.fromstring(driver.page_source)
        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            title = product.xpath('.//h1/text()')
            if title:
                title = title[0]
                price = product.cssselect('div.mGXnE._37W_B span')
                price = [x.text for x in price]
                if price:
                    currency = price[0]
                    price = ''.join(price[1:])
                else:
                    currency = 'None'
                    price = 'None'
                stars = product.xpath('.//span[@class="eXPaM"]/text()')
                if stars:
                    stars = stars[0]
                else:
                    stars = 'None'
                nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
                if nb_sold:
                    nb_sold = nb_sold[0]
                else:
                    nb_sold = 'None'
                supl = product.xpath('.//a[@class="ox0KZ"]/text()')
                if supl:
                    supl = supl[0]
                else:
                    supl = 'None'
                ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
                if ship_cost:
                    ship_cost = ship_cost[0]
                else:
                    ship_cost = 'None'
                product_links = product.xpath('./@href')
                if product_links:
                    product_links = str(product_links[0])
                else:
                    product_links = 'None'
                row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
                results.append(row)
        # driver.close() was removed here so the driver stays open for the next page

    # the DataFrame/MongoDB lines below were moved out of the page_nb loop
    df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks"))
    client = MongoClient("mongodb://localhost:27017/")
    # still a fixed collection; use client['db2'][subject] (see the first answer)
    # if you want one collection per subject
    collection = client['db2']['aliex2']
    data = df.to_dict(orient='records')
    collection.insert_many(data)
        
    print("--- %s seconds ---" % (time.time() - start_time))
    driver.quit()

    return

I moved the four database lines out of the page_nb loop, so the rows from all pages are appended to the results list, and at the end the whole list is written to the DataFrame and then to the database.

The datatable view and the datatable HTML remain the same.

Try this simple idea to solve your problem.
