python - 清理 DataFrame 输出 | Selenium | Python
问题描述
我已经设置了一个脚本来遍历一组 URL。该脚本运行良好,但我不知道如何调整以产生更清晰的 CSV 输出。
我会尽我所能,尽量减少清理格式、删除 Excel 单元格等所需的时间。
注意:我目前抓取搜索量(Volume)文本的方式,是我摸索出来的唯一能拿到所需内容的方法。希望我们能找到一个好的解决方案来改进最终输出,而不会影响脚本的这一部分。
这是我的脚本:
# Pages to scrape, visited in order with the same browser session.
group_url = [
    'https://www.example.com',
    'https://www.example2.com',
    'https://www.example3.com',
    'https://www.example4.com',
]

# XPath of the volume cell (6th column) for a given 1-based table row.
VOLUME_XPATH = (
    '//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]'
    '/div/table/tbody/tr[{row}]/td[6]/div'
)
# Only the first five keywords, URLs and volume rows are taken per page.
MAX_ROWS = 5


def _first_texts(elements, limit=MAX_ROWS):
    """Return the text of up to *limit* elements.

    An element whose text lookup raises NoSuchElementException is skipped
    and does not count towards the limit.
    """
    texts = []
    for element in elements:
        if len(texts) >= limit:
            break
        try:
            texts.append(element.text)
        except NoSuchElementException:
            continue
    return texts


# Every scraped value becomes its own single-key dict in this list.
data = []
wait = WebDriverWait(driver, 90)
for group in group_url:
    driver.get(group)
    # Block (up to 90 s) until the first row's volume cell is visible.
    wait.until(EC.visibility_of_element_located(
        (By.XPATH, VOLUME_XPATH.format(row=1))))
    # Fixed pause kept as-is; presumably gives the remaining rows time to
    # render -- TODO confirm whether it is still needed.
    time.sleep(3)

    # find_element(s)_by_* helpers were removed in Selenium 4.3; the
    # By-based calls below work on both Selenium 3 and 4.
    kws = driver.find_elements(By.CSS_SELECTOR, ".css-hijzdp-base")
    data.extend({"Keyword": text} for text in _first_texts(kws))

    urls = driver.find_elements(
        By.CSS_SELECTOR,
        ".css-a5m6co-text.css-p8ym46-fontFamily"
        ".css-11397xj-fontSize.css-18j1nfb-display")
    data.extend({"URL": text} for text in _first_texts(urls))

    # One lookup per table row; a row that is missing is simply skipped.
    for row in range(1, MAX_ROWS + 1):
        try:
            cell = driver.find_element(By.XPATH, VOLUME_XPATH.format(row=row))
        except NoSuchElementException:
            continue
        data.append({"Volume{}".format(row): cell.text})

driver.close()
print(data)

# Write the collected entries to CSV.
df = pd.DataFrame(data)
df.to_csv('testOutput 11_11_21.csv')
解决方案
您把每一项数据都单独追加到了 `data` 中。应当先在 for 循环内把它们收集到一个字典里,然后再把这个字典追加到列表 `data` 中:
# Pages to scrape, visited in order with the same browser session.
group_url = [
    'https://www.example.com',
    'https://www.example2.com',
    'https://www.example3.com',
    'https://www.example4.com',
]

# XPath of the volume cell (6th column) for a given 1-based table row.
VOLUME_XPATH = (
    '//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]'
    '/div/table/tbody/tr[{row}]/td[6]/div'
)
# Only the first five keywords, URLs and volume rows are taken per page.
MAX_ROWS = 5


def _first_texts(elements, limit=MAX_ROWS):
    """Return the text of up to *limit* elements.

    An element whose text lookup raises NoSuchElementException is skipped
    and does not count towards the limit.
    """
    texts = []
    for element in elements:
        if len(texts) >= limit:
            break
        try:
            texts.append(element.text)
        except NoSuchElementException:
            continue
    return texts


# One dict per visited URL, so each page becomes one DataFrame row.
data = []
wait = WebDriverWait(driver, 90)
for group in group_url:
    tmp_dict = {}
    driver.get(group)
    # Block (up to 90 s) until the first row's volume cell is visible.
    wait.until(EC.visibility_of_element_located(
        (By.XPATH, VOLUME_XPATH.format(row=1))))
    # Fixed pause kept as-is; presumably gives the remaining rows time to
    # render -- TODO confirm whether it is still needed.
    time.sleep(3)

    # find_element(s)_by_* helpers were removed in Selenium 4.3; the
    # By-based calls below work on both Selenium 3 and 4.
    kws = driver.find_elements(By.CSS_SELECTOR, ".css-hijzdp-base")
    tmp_dict["Keyword"] = _first_texts(kws)

    urls = driver.find_elements(
        By.CSS_SELECTOR,
        ".css-a5m6co-text.css-p8ym46-fontFamily"
        ".css-11397xj-fontSize.css-18j1nfb-display")
    tmp_dict["URL"] = _first_texts(urls)

    # One lookup per table row; a missing row leaves its key unset.
    for row in range(1, MAX_ROWS + 1):
        try:
            cell = driver.find_element(By.XPATH, VOLUME_XPATH.format(row=row))
        except NoSuchElementException:
            continue
        tmp_dict["Volume{}".format(row)] = cell.text

    data.append(tmp_dict)

driver.close()
print(data)

# Write the collected rows to CSV.
df = pd.DataFrame(data)
df.to_csv('testOutput 11_11_21.csv')
推荐阅读
- flickity - Flickity 如何使轮播根据图像大小调整大小?
- php - 第 150 行 /app/public/wp-includes/class-walker-nav-menu.php 中传递的参数无效
- angular - 如果路由到相同的参数,Angular 不会重新加载
- wordpress - 菜单中主页部分的自定义链接在其他页面上不起作用
- outlook-addin - Outlook-VSTO-AddIn 活动检查器在邮件项加载时为空
- javascript - 是否可以从 REST Service Bing 地图中获取城市名称?
- vb.net - 找不到类型。请确保引用了包含此类型的程序集
- spring - 迁移到 Spring Boot 2.x 后,健康检查失败并出现 404 错误
- linux - websphere 上的其他语言字符问题
- reactjs - 正确更新嵌套属性的状态