python - Selenium Web Scrape - why does this script return 500k rows?
Problem Description
I made a script to scrape a website for all the product information in certain categories, but my code returns 500,000+ rows when there are only about 3,000 items in a category.
I'm also fairly new to Python, so any help is appreciated.
The code is attached below:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 4 20:31:23 2019
@author:
"""
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.support.ui as ui
import selenium.webdriver.support.expected_conditions as EC
from bs4 import BeautifulSoup
import os, sys
import time
from urllib.parse import urljoin
import pandas as pd
import re
import numpy as np
# base set up
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
os.chdir("C:/Users/user/desktop/scripts/python")
cwd = os.getcwd()
main_dir = os.path.abspath(os.path.join(cwd, os.pardir))
print('Main Directory:', main_dir)
chromedriver = ("C:/Users/user/desktop/scripts/python/chromedriver.exe")
os.environ["webdriver.chrome.driver"] = chromedriver
# browser = webdriver.Chrome(options=options, executable_path=chromedriver)
mainurl = "https://www.bunnings.com.au/our-range"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
page = requests.get(mainurl, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
# script start
subcat = []
for item in soup.findAll('ul', attrs={'class': 'chalkboard-menu'}):
    links = item.find_all('a')
    for link in links:
        subcat.append(urljoin(mainurl, link.get("href")))
subcat
result = pd.DataFrame()
for adrs in subcat[0:1]:
    # headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    # page = requests.get(adrs, headers=headers)
    # soup = BeautifulSoup(page.content, 'html.parser')
    # pagelink = adrs
    # adrs="https://www.bunnings.com.au/our-range/storage-cleaning/cleaning/brushware-mops/indoor-brooms"
    catProd = pd.DataFrame()
    url = adrs
    browser = webdriver.Chrome(options=options, executable_path=chromedriver)
    browser.get(url)
    # scroll down until the page height stops changing
    lenOfPage = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    match = False
    while (match == False):
        lastCount = lenOfPage
        time.sleep(3)
        lenOfPage = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
        if lastCount == lenOfPage:
            match = True
    # keep clicking the 'more products' buttons; when neither can be
    # found any more, the exception ends the loop
    reached = False
    while (reached == False):
        try:
            browser.find_element_by_css_selector('#MoreProductsButton > span').click()
            lenOfPage = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
            match = True
            while (match == True):
                lastCount = lenOfPage
                time.sleep(3)
                lenOfPage = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
                if lastCount == lenOfPage:
                    match = True
                    browser.find_element_by_css_selector('#content-layout_inside-anchor > div.search-result__content > div > div > section > div:nth-child(4) > div > div:nth-child(2) > div > button > div.view-more_btn_text').click()
        except:
            reached = True
    # grab the items
    page = browser.page_source
    soup = BeautifulSoup(page, 'html.parser')
    browser.close()
    for article in soup.findAll('article', attrs={'class': 'product-list__item hproduct special-order-product'}):
        for product in article.findAll('img', attrs={'class': 'photo'}):
            pName = product['alt']
            pCat = adrs
            pID = article['data-product-id']
            temp = pd.DataFrame({'proID': [pID], 'Product': [pName], 'Category': [pCat]})
            catProd = catProd.append(temp)
            result = result.append(catProd)
    time.sleep(3)
result.head()
#writes to CSV
writer = pd.ExcelWriter('test123123.xlsx')
result.to_excel(writer,'Sheet1')
writer.save()
The code takes about 20 minutes to run through ~3,000 items, which seems crazy to me, but the main problem is still that I get far too many duplicates: 500,000+ rows when I only need about 3,500 for the category.
Solution
The problem is right here:
for product in article.findAll('img', attrs={'class': 'photo'}):
    pName = product['alt']
    pCat = adrs
    pID = article['data-product-id']
    temp= pd.DataFrame({'proID':[pID],'Product':[pName],'Category':[pCat]}) #<-------------- temp DataFrame
    catProd=catProd.append(temp) #<------------ temp appending into catProd dataframe
    result = result.append(catProd) #<----------- catProd appending into result DataFrame
You're basically doing a double append: each pass takes your temp dataframe and appends it to your catProd dataframe... and then, on the same pass, appends the whole of catProd into your result dataframe. Every product's rows therefore get re-appended once for every product that follows them, so your result dataframe balloons, quadratically in the number of products per category.
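You can see the blow-up in isolation with a minimal sketch (dummy data, using the same DataFrame.append API the script uses; note that append was removed in pandas 2.0):

import pandas as pd

result = pd.DataFrame()
catProd = pd.DataFrame()

# pretend we scraped five products from one category page
for i in range(5):
    temp = pd.DataFrame({'proID': [i], 'Product': ['item %d' % i], 'Category': ['dummy']})
    catProd = catProd.append(temp)   # catProd grows by one row, as intended
    result = result.append(catProd)  # but the ENTIRE catProd is re-appended every pass

print(len(catProd))  # 5 rows, as expected
print(len(result))   # 1 + 2 + 3 + 4 + 5 = 15 rows

Five products already produce 15 rows; a few thousand products per category is how you end up at 500,000+.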
There are a couple of ways to fix this. One is to move result = result.append(catProd) outside of that loop, so the append happens only after catProd is FULLY populated. Or just eliminate catProd altogether and keep appending temp straight into result.
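A sketch of the first option, reusing the script's own variables (the full code below implements the second option instead):

for article in soup.findAll('article', attrs={'class': 'product-list__item hproduct special-order-product'}):
    for product in article.findAll('img', attrs={'class': 'photo'}):
        pName = product['alt']
        pCat = adrs
        pID = article['data-product-id']
        temp = pd.DataFrame({'proID': [pID], 'Product': [pName], 'Category': [pCat]})
        catProd = catProd.append(temp)
# append once, after catProd holds the whole category
result = result.append(catProd)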
I also cleaned up a few things, i.e. reset the index of the dataframe and excluded the index from the Excel write. And I added explicit waits (i.e. wait for the button to be present) instead of time.sleep, which should speed it up a bit.
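In isolation, the explicit-wait pattern used in the full code below looks like this; WebDriverWait polls the page and proceeds as soon as the condition is met, instead of always sleeping for a fixed 3 seconds:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC

wait = WebDriverWait(browser, 10)  # raises TimeoutException after 10 seconds
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#MoreProductsButton")))
browser.find_element_by_css_selector('#MoreProductsButton').click()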
Full code below. Don't forget to change for adrs in subcat[0:1] so that it runs through the whole list; I only had it go through the first URL.
One last thing: I threw in a way of timing it. Running just the first URL, 895 products scraped and saved, gave Duration: 0 Hours, 02 Minutes, 48 Seconds.
Finally, I had to comment out a few things like os.chdir so that I could run it, so don't forget to un-comment those.
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.support.ui as ui
from selenium.webdriver.support.ui import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
from bs4 import BeautifulSoup
import os, sys
import time
from urllib.parse import urljoin
import pandas as pd
import re
import numpy as np
import datetime
# base set up
start_time = datetime.datetime.now()
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
#os.chdir("C:/Users/user/desktop/scripts/python")
#cwd = os.getcwd()
#main_dir = os.path.abspath(os.path.join(cwd, os.pardir))
#print('Main Directory:', main_dir)
chromedriver = ("C:/chromedriver_win32/chromedriver.exe")
os.environ["webdriver.chrome.driver"] = chromedriver
# browser = webdriver.Chrome(options=options, executable_path=chromedriver)
mainurl = "https://www.bunnings.com.au/our-range"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
page = requests.get(mainurl, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
# script start
subcat = []
for item in soup.findAll('ul', attrs={'class': 'chalkboard-menu'}):
    links = item.find_all('a')
    for link in links:
        subcat.append(urljoin(mainurl, link.get("href")))
subcat
result = pd.DataFrame()
for adrs in subcat:
    # headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    # page = requests.get(adrs, headers=headers)
    # soup = BeautifulSoup(page.content, 'html.parser')
    # pagelink = adrs
    # adrs="https://www.bunnings.com.au/our-range/storage-cleaning/cleaning/brushware-mops/indoor-brooms"
    catProd = pd.DataFrame()
    url = adrs
    browser = webdriver.Chrome(options=options, executable_path=chromedriver)
    browser.get(url)
    # scroll down until the page height stops changing
    lenOfPage = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    match = False
    while (match == False):
        lastCount = lenOfPage
        #time.sleep(3)
        lenOfPage = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
        if lastCount == lenOfPage:
            match = True
    # keep clicking the 'more products' buttons; when neither can be
    # found any more, the exception ends the loop
    reached = False
    while (reached == False):
        try:
            wait = WebDriverWait(browser, 10)
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#MoreProductsButton")))
            browser.find_element_by_css_selector('#MoreProductsButton').click()
            lenOfPage = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
            match = True
            while (match == True):
                lastCount = lenOfPage
                #time.sleep(3)
                lenOfPage = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
                if lastCount == lenOfPage:
                    match = True
                    #time.sleep(3)
                    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.view-more_btn_text")))
                    browser.find_element_by_css_selector('#content-layout_inside-anchor > div.search-result__content > div > div > section > div:nth-child(4) > div > div:nth-child(2) > div > button > div.view-more_btn_text').click()
        except:
            reached = True
    # grab the items
    page = browser.page_source
    soup = BeautifulSoup(page, 'html.parser')
    browser.close()
    for article in soup.findAll('article', attrs={'class': 'product-list__item hproduct special-order-product'}):
        for product in article.findAll('img', attrs={'class': 'photo'}):
            pName = product['alt']
            pCat = adrs
            pID = article['data-product-id']
            temp = pd.DataFrame({'proID': [pID], 'Product': [pName], 'Category': [pCat]})
            #catProd=catProd.append(temp)
            result = result.append(temp)
    #time.sleep(3)
result.head()
result = result.reset_index(drop=True)
#writes to CSV
writer = pd.ExcelWriter('C:/test123123.xlsx')
result.to_excel(writer,'Sheet1', index=False)
writer.save()
finish_time = datetime.datetime.now()
duration = finish_time - start_time
dur_list = str(duration).split(':')
hour = dur_list[0]
minutes = dur_list[1]
seconds = dur_list[2].split('.')[0]
print ('Duration: %s Hours, %s Minutes, %s Seconds' %(hour, minutes, seconds))
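One more speed note, separate from the answer above: every DataFrame.append copies the whole frame, so even the fixed version does a fair amount of copying. A common alternative (a sketch, not what the code above does) is to collect plain dicts in a list and build the frame once at the end:

rows = []
for article in soup.findAll('article', attrs={'class': 'product-list__item hproduct special-order-product'}):
    for product in article.findAll('img', attrs={'class': 'photo'}):
        rows.append({'proID': article['data-product-id'],
                     'Product': product['alt'],
                     'Category': adrs})

# one DataFrame construction instead of thousands of incremental appends
result = pd.DataFrame(rows, columns=['proID', 'Product', 'Category'])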