python - 为什么我的数据框在迭代过程中没有追加数据?
问题描述
我下面的代码在只处理一个 url 时可以正常工作,但是当我传入一个 url 列表并且代码运行结束后,
最终结果中只保留了最后一个 url 检索到的 product_info。
我确定我错过了一些东西,但我不知道是什么。
from logging import exception
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import datetime
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
import random
import pandas as pd
# Scrape product listings from each Coolmod category URL with Selenium,
# parse the rendered page with BeautifulSoup and write the rows to a
# timestamped CSV file.
# NOTE(review): the indentation of this snippet was lost when it was pasted,
# so any nesting described in the comments below is inferred — confirm
# against the original script.

# Configure Chrome: ignore certificate errors and browse in incognito mode.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
#options.add_argument('--headless')
# Start Chrome using the chromedriver binary at /usr/bin/chromedriver.
driver = webdriver.Chrome(executable_path=r"/usr/bin/chromedriver", options=options)
#url = 'https://www.coolmod.com/componentes-pc-procesadores?f=375::No'
# Category pages to scrape; the commented-out entries are currently disabled.
url_list = [
'https://www.coolmod.com/componentes-pc-procesadores?f=375::No',
'https://www.coolmod.com/componentes-pc-placas-base?f=55::ATX||prices::65-255',
# 'https://www.coolmod.com/componentes-pc-memorias-ram?f=41::16GB||473::No||prices::4-209||9999::3549',
# 'https://www.coolmod.com/discos-ssd?f=501::M.2%20PCI-E%203.0||501::M.2%20PCI-E%204.0||204::500%20GB||204::960%20GB||204::1%20TB',
# 'https://www.coolmod.com/componentes-pc-fuentes-alimentacion?f=81::Si||80::750||80::850',
# 'https://www.coolmod.com/disipadores-ventiladores-disipadores?f=9999::2022||prices::35-95',
# 'https://www.coolmod.com/componentes-pc-torres-cajas?f=9999::1671||103::ATX||prices::60-170',
]
# Visit each category page in turn.
for url in url_list:
driver.get(url)
# Random pause of 4.0-7.5 seconds after requesting the page.
sleep(random.uniform(4.0, 7.5))
# Click the element with class 'confirm' when it exists (presumably a
# cookie/consent popup — confirm); its absence is ignored.
try:
popup = driver.find_element_by_class_name('confirm').click()
except NoSuchElementException:
pass
# Keep clicking the 'button-load-more' element until it can no longer be
# found; the NoSuchElementException handler breaks out of the loop.
# NOTE(review): `iter` shadows the built-in of the same name and only ever
# grows, so `iter > 0` never ends the loop by itself.
iter = 1
while iter > 0:
sleep(random.uniform(3.5, 7.5))
try:
ver_mas = driver.find_element_by_class_name('button-load-more')
actions = ActionChains(driver)
# Move the pointer to the button, then click it through JavaScript
# instead of a native Selenium click.
actions.move_to_element(ver_mas).perform()
driver.execute_script("arguments[0].click();", ver_mas)
except NoSuchElementException:
break
iter += 1
# Parse the current page source with the lxml parser.
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')
# print(soup)
# Every div with this exact class string is treated as one product entry.
items = soup.find_all('div', class_='col-lg-12 col-md-12 col-sm-8 col-xs-9 cat-container-text')
# print(len(items))
# NOTE(review): if the next three assignments run inside the URL loop,
# df_list is emptied for every URL and only the last URL's rows reach the
# DataFrame; they presumably belong before `for url in url_list:` — confirm.
df_list = []
store = 'Coolmod'
# Extraction timestamp, truncated to whole seconds.
extraction_date = datetime.datetime.today().replace(microsecond=0)
for item in items:
# Product name with tabs and line breaks removed.
product_name = item.find('div',class_ = 'product-name').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
# Price: three alternative class strings are attempted.
# NOTE(review): if all clauses belong to a single try statement, the second
# `except ValueError` can never be selected because the first one already
# matches. A missing div presumably makes `.text` raise AttributeError,
# which only the bare `except` would catch, leaving `price` unassigned or
# holding the previous item's value — confirm.
try:
price = item.find('div', class_ = 'margin-top-20 mod-product-price text-big').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
except ValueError:
price = item.find('div', class_ = 'mod-product-price text-big').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
except ValueError:
price = item.find('div', class_ = 'margin-top-20 mod-product-price text-medium').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
except:
pass
# Availability: falls back to the 'local-available' variant, then "No info".
# NOTE(review): as above, a second `except AttributeError` on the same try
# statement would be unreachable — confirm the intended nesting.
try:
availability = item.find('div', class_ = 'product-availability cat-product-availability').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
except AttributeError:
availability = item.find('div', class_ = 'product-availability cat-product-availability local-available').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
except AttributeError:
availability = "No info"
# One dictionary (row) per product.
product_info = {
'product_name' : product_name,
'price' : price,
'availability' : availability,
'store' : store,
'date_extraction' : extraction_date,
}
df_list.append(product_info)
# Build the DataFrame from the collected rows and print it.
df = pd.DataFrame(df_list)
print(df)
# NOTE(review): `site` is not used anywhere in the visible snippet.
site = 'mysite'
# The Windows path below is immediately overwritten by the Linux path.
path = "C:\\PriceTracking\\coolmod\\"
path = '/home/pi/Documents/WebScraping Files/'+store+'/'
# File name pattern: <path><store>_<YYYYMMDD>_<HHMMSS>.csv
mydate = extraction_date.strftime('%Y%m%d')
mytime = extraction_date.strftime('%H%M%S')
filename = path+store+'_'+mydate+'_'+mytime+".csv"
# Write the CSV without the index column.
df.to_csv(filename,index=False)
解决方案
只是为了发布答案
# Run these three statements once, before `for url in url_list:` (right after
# the url_list definition), so df_list accumulates rows from every URL.
df_list = []
store = 'Coolmod'
extraction_date = datetime.datetime.today().replace(microsecond=0)
这段代码需要放在循环之外,我把它放在了 url_list 之后,现在可以正常工作了。
谢谢
推荐阅读
- php - php中的上标
- python - 在边界框中打印得分值
- sql - 如何在 Teradata SQL 中选择从当前日期起最近 30 天内至少进行 2 次交易的客户?
- sql - 从每个组中获取随机唯一记录
- reactjs - 单击按钮时显示圆形不确定进度
- c# - 如何停止 Task.Run()?
- azure - 无法为 cosmos db 分配 Azure 角色
- nlp - 识别对比两个语料库的短语
- animation - 如何在 bootstrap 4 中应用 bootstrap 3 模态动画过渡效果
- flutter - 从flutter模拟器发送http请求到localhost api/win10