python - BeautifulSoup & Selenium - 异常后复制到我的电子表格的数据不正确
问题描述
第一次发帖,欢迎有建设性的批评。
我正在尝试为我的公司自动执行市场分析任务。我编写了一个程序,该程序在大多数情况下都可以正常运行,但是一旦我添加了一个异常来处理邮政编码请求,它就停止了工作。现在 BeautifulSoup 似乎正在加载不正确的源页面并且找不到我需要的数据。
我尝试更新 BeautifulSoup 从中收集源代码的链接变量,但这似乎不起作用。
# Global variables
os.chdir(r'C:\Users\caleb.grauberger\Desktop\Scripts\Market Analysis for GSA')
WB_ORIG = openpyxl.load_workbook("test_pricelist.xlsx")
SHEET_ORIG = WB_ORIG['Sheet1']
MAX_ROW = SHEET_ORIG.max_row
NEW_WB_NAME = 'market_analysis_test.xlsx'
def pricelist_manip():
# Note: This is the starting row number for the next for loop
r = 2
wb_new = openpyxl.load_workbook(NEW_WB_NAME)
sheet_new = wb_new.active
os.chdir(r"C:\Users\user\PycharmProjects\selenium\drivers")
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=chrome_options)
os.chdir(r'C:\Users\caleb.grauberger\Desktop\Scripts\Market Analysis for GSA')
driver.set_page_load_timeout(20)
driver.get("https://www.gsaadvantage.gov/advantage/main/start_page.do")
driver.maximize_window()
for i in range(2, MAX_ROW):
"""Copies the value of the Manufacturer's part number and pastes it into the search box"""
start_time = time.time()
driver.set_page_load_timeout(20)
item_id = SHEET_ORIG.cell(row=i, column=1).value
driver.find_element_by_id("twotabsearchtextbox").clear()
driver.find_element_by_id("twotabsearchtextbox").send_keys(item_id)
driver.find_element_by_id("twotabsearchtextbox").send_keys(Keys.ENTER)
product_links = driver.find_elements_by_xpath("//img[@alt='Click to view product details']/following::a[1]")
print(i - 1)
print(item_id)
links_list = []
for values in product_links:
"""Creates a list of hyperlinks for each product on the page."""
hyperlink = values.get_attribute("href")
links_list.append(hyperlink)
for x in range(len(links_list)):
"""Goes through the links_list one by one. During each iteration,
the program goes to the link, copies the necessary information
to the spreadsheet, and moves on."""
driver.set_page_load_timeout(20)
driver.get(links_list[x])
# Sometimes a link will bring up an option to enter ZIP Code. This deals with that.
try:
driver.find_element_by_id("zip").clear()
driver.find_element_by_id("zip").send_keys('91911')
driver.find_element_by_id("zip").send_keys(Keys.ENTER)
time.sleep(1)
driver.get(links_list[x])
except selenium.common.exceptions.NoSuchElementException:
pass
rows = r
source = requests.get(links_list[x]).text
soup = BeautifulSoup(source, 'lxml')
body = soup.find('body')
try:
part_num = body.find('td', width="78%").text
except AttributeError:
print("AttributeError: Contractor Part Number defaulted")
part_num = item_id
finally:
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
for comment in comments:
"""Iterates over html searching for contractor names. Then writes contractor names to xlsx"""
if comment.strip() == 'Contractor display':
contractor = comment.find_next_sibling('td')
sheet_new['C' + str(r)] = contractor.text
sheet_new['A' + str(r)] = item_id
sheet_new['B' + str(r)] = part_num
r += 1
r = rows
for comment in comments:
if comment.strip() == 'unit price':
"""Iterates over html searching for prices. Then writes prices to xlsx"""
price = comment.find_next_sibling('td')
sheet_new['D' + str(r)] = price.text
r += 1
r = rows + 3
time.sleep(1)
wb_new.save(NEW_WB_NAME)
# This checks the runtime of the loop
end_time = time.time()
duration = end_time - start_time
duration = round(duration, 2)
print("Runtime for this iteration was {} seconds.\n".format(duration))
time.sleep(2)
wb_new.save(NEW_WB_NAME)
wb_new.close()
我期望代码为每个项目的每个价格列表输出以下内容:
Part # - Contractor Part # - Contractor Name - Price
我遇到问题的示例是在搜索DF2-850-FDPO-KH-S-R
. 有五个报价通常可以顺利记录。在上面的代码中,没有记录任何结果。
解决方案
这是一个从中受益的建议pandas
(您可以稍后轻松导出到 excel)
import pandas as pd
import re
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
#A panda dataframe where all results will get stored
global_df = pd.DataFrame(columns=['gsin','Part #', 'Contractor Part #', 'Contractor Name', 'Price','Deliv days', 'Min Order', 'FOB/Shipping'])
part = "MCMS00001"
driver.get("https://www.gsaadvantage.gov/advantage/main/start_page.do")
element = driver.find_element_by_id("twotabsearchtextbox")
element.send_keys(part)
driver.find_element_by_class_name('nav-submit-input').click()
#Find links
html = driver.page_source
soup= BeautifulSoup(html, 'html')
links = soup.find_all('a', href = re.compile('gsin='))
links = list(set([x.get('href') for x in links]))
#Explore links
for link in links:
driver.get("https://www.gsaadvantage.gov"+link)
#Sometimes zip pages shows up
try:
element = driver.find_element_by_id("zip")
element.send_keys("91911")
driver.find_element_by_name('submit').click()
except:
pass
html = driver.page_source
soup= BeautifulSoup(html, 'html')
#Get the table with the expected elements
table = soup.find('table', {'class':'greybox'})
item_numbers = []
#Extract itemNumbers
for tr in table.find_all('tr')[1:]: #skip the header line
item_number = tr.find('a', href=re.compile('itemNumber='))
if item_number != None:
pattern = 'itemNumber=(.*?)\&'
item_numbers.append(re.findall(pattern, item_number.get('href'))[0])
#Store table into a dataframe
temp_df = pd.read_html(str(soup.find('table', {'class':'greybox'})))[0]
#EDIT START : handle missing columns
for x in [2,8,14,16,18]:
if x not in temp_df.columns:
temp_df[x] = ""
temp_df = temp_df[[2,8,14,16,18]].dropna(how='all')
temp_df = temp_df[1:]
temp_df.columns = ['Price', 'Contractor Name', 'Deliv days', 'Min Order', 'FOB/Shipping']
temp_df['Contractor Part #'] = item_numbers
temp_df['Part #'] = part
#Extracting gsin from the explored link which differs in case of multiple links
gsin = link.split("gsin=")[1]
temp_df['gsin'] = gsin
temp_df = temp_df[['gsin','Part #', 'Contractor Part #', 'Contractor Name', 'Price','Deliv days', 'Min Order', 'FOB/Shipping']]
#EDIT END
#Append new results to the global dataframe
global_df = pd.concat([global_df, temp_df])
最后 global_df 数据框如下所示:
+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
| | gsin | Part # | Contractor Part # | Contractor Name | Price | Deliv days | Min Order | FOB/Shipping |
+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
| 1 | 11000058164089 | MCMS00001 | MCMS00001-MULCM-V54MIL-REGX-XXL | UNIFIRE INC | $154.15 | 30 days delivered ARO | O-CONUS,AK,PR,HI | |
| 1 | 11000058164195 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-L | UNIFIRE INC | $154.15 | 30 days delivered ARO | NaN | O-CONUS,AK,PR,HI |
| 3 | 11000058164195 | MCMS00001 | MCMS00001-MULCM-V54MILREG-L | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO | $100.00 | D-CONUS/O-AK,PR,HI |
| 1 | 11000063336537 | MCMS00001 | MCMS00001 | o | $153.99 | $50.00 | D-CONUSND-AK,PR,HI | NaN |
| 3 | 11000063336537 | MCMS00001 | MCMS00001 | s dv | $157.16 | $50.00 | O-CONUS,AK,PR,HI | NaN |
| 1 | 11000058164083 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-S | UNIFIRE INC | $154.15 | 30 days delivered ARO | NaN | O-CONUS,AK,PR,HI |
| 3 | 11000058164083 | MCMS00001 | MCMS00001-MULCM-V54MILREG-S | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO | $100.00 | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163538 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-M | UNIFIRE INC | $154.15 | 30 days delivered ARO | NaN | O-CONUS,AK,PR,HI |
| 3 | 11000058163538 | MCMS00001 | MCMS00001-MULCM-V54MILREG-M | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO | $100.00 | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163624 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-XS | UNIFIRE INC | $154.15 | 30 days delivered ARO | NaN | O-CONUS,AK,PR,HI |
| 3 | 11000058163624 | MCMS00001 | MCMS00001-MULCM-V54MILREG-XS | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO | $100.00 | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163625 | MCMS00001 | MCMS00001-MULCM-V54MIL-REGXX-XXXL | UNIFIRE INC | $154.15 | 30 days delivered ARO | O-CONUS,AK,PR,HI | |
| 1 | 11000058163763 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-XL | UNIFIRE INC | $154.15 | 30 days delivered ARO | NaN | O-CONUS,AK,PR,HI |
| 3 | 11000058163763 | MCMS00001 | MCMS00001-MULCM-V54MILREG-XL | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO | $100.00 | D-CONUS/O-AK,PR,HI |
+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
推荐阅读
- spring - 在我的 Spring Boot 中添加 facebook social 后无法使用任何控制器的 post 方法
- c# - 将 2 个不同类的 IList 组合成 1 个 IList,加入一个公共属性
- reactjs - 将我的 React Native 项目文件夹移动到新机器上,我如何让它运行?
- apache-flink - Flink区间join不输出
- php - 使用 PHP7 的 Symfony 2.8 项目:需要扩展 DOM
- scala - Scala - 从 spark 数据框中获取空列名称的最有效方法是什么?
- java - 需要帮助修复无限循环
- angular - 加载和接收服务器响应后刷新元素
- testing - 是否可以扩展 TestCafe Studio?
- laravel - 更新 Laravel Migration 命令以一次性获取所有常见的导入和字段以获取迁移文件