1👍
MongoDB creates new databases and collections once you try to reference them. So by changing the referenced db and/or collection at the end of your scraping run, you could write to a new db/collection.
To create a new collection for every subject, you could change the database reference in your scrap method to something like:
# Connect to the local MongoDB server; databases and collections are
# created lazily on first write, so no setup is needed beforehand.
client = MongoClient("mongodb://localhost:27017/")
# use variable db and collection names
collection_name = subject
collection = client["db2"][collection_name]
# One document per DataFrame row.
data = df.to_dict(orient = 'records')
collection.insert_many(data)
To use the variable reference in the datatable_view, you would have to use the subject from the POST request. Using the following snippet might break your request, but it allows you to read the crawled subject dynamically:
def datatable_view(request):
    """Scrape the submitted subject and render its products as a datatable.

    On POST with a valid form: runs the scraper for the submitted subject,
    then reads the MongoDB collection named after that subject (one
    collection per subject) and renders it into ``datatable.html``.

    NOTE(review): on GET, or when the form is invalid, this falls through
    to a bare ``return`` (None), which Django rejects with a ValueError --
    confirm the intended fallback response (e.g. rendering an empty form).
    """
    if request.method == 'POST':
        form = Scraping(request.POST)
        if form.is_valid():
            subject = form.cleaned_data['subject']
            # Scrape first so the subject's collection exists before querying.
            scrap(subject)
            client = pymongo.MongoClient("mongodb://localhost:27017/")
            # use variable names for db and collection reference
            db = client["db2"]
            col = db[subject]
            products = col.find()
            context = {'products': products}
            return render(request, 'datatable.html', context)
    return
0👍
You are always saving the data in the collection called aliex2,
which is why it is being stored in the same collection. If you want to store the data in a new collection every time you scrape, make sure you use a unique collection name instead of aliex2.
- [Answered ]-Using key as value in Mongoengine
- [Answered ]-How to create django model by pressing button
- [Answered ]-How can I prevent duplicate usernames, given that they are case sensitive by default?
- [Answered ]-Django FileField – how to save files to different directories
- [Answered ]-Django catching http-referer not working always
0👍
Looking at your problem, I got an idea that may help you:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import time
def scrap(subject):
    """Scrape AliExpress search results for *subject* into MongoDB.

    Crawls result pages 1-4, scrolls each page to the bottom so the
    lazily-loaded product cards get rendered, extracts one row per
    product, and finally bulk-inserts all rows into ``db2.aliex2``
    in a single ``insert_many`` call.
    """
    start_time = time.time()
    options = Options()
    options.headless = True
    driver = webdriver.Edge(
        executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",
        options=options,
    )
    # NOTE(review): '&ltype=' restored from the garbled '<ype=' in the
    # original paste ('&lt' was HTML-unescaped to '<') -- confirm against
    # the live AliExpress URL.
    url = ('https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0'
           '&SearchText=' + subject + '&ltype=wholesale&SortType=default&page={}')
    # Accumulate rows across ALL pages; one DataFrame/insert at the end
    # (the whole point of this answer -- see the explanation below it).
    results = []
    for page_nb in range(1, 5):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)
        # Scroll until the page offset stops growing, so the JS-injected
        # product cards all make it into the DOM.
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            if new_offset <= current_offset:
                break
            current_offset = new_offset
        sleep(3)
        tree = html.fromstring(driver.page_source)
        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            title = product.xpath('.//h1/text()')
            if title:
                title = title[0]
            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]
            # assumes the first span is the currency symbol and the rest
            # spell out the amount -- TODO confirm; raises IndexError if
            # no price spans are found.
            currency = price[0]
            price = ''.join(price[1:])
            stars = product.xpath('.//span[@class="eXPaM"]/text()')
            stars = stars[0] if stars else 'None'
            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            nb_sold = nb_sold[0] if nb_sold else 'None'
            supl = product.xpath('.//a[@class="ox0KZ"]/text()')
            supl = supl[0] if supl else 'None'
            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            ship_cost = ship_cost[0] if ship_cost else 'None'
            product_links = product.xpath('./@href')
            if product_links:
                product_links = str(product_links[0])
            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)
    # Database work stays OUT of the page loop: one DataFrame and one
    # insert_many for the whole run.
    df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars",
                                        "Orders", "Shipcost", "Supplier",
                                        "Productlinks"))
    client = MongoClient("mongodb://localhost:27017/")
    collection = client['db2']['aliex2']
    data = df.to_dict(orient='records')
    collection.insert_many(data)
    print("--- %s seconds ---" % (time.time() - start_time))
    driver.quit()
    return
I moved the four database lines out of the page_nb loop, so the rows from all pages are added to the results list, and at the end the results list is written to the DataFrame and to the database.
The datatable view and datatable HTML remain the same.
Try this simple idea to solve your problem.
- [Answered ]-Django, MySQL, and Cloud9
- [Answered ]-Passing a serialized object through a URL
- [Answered ]-Save uploaded files in subfolder depending on request