BeautifulSoup & Selenium - Incorrect data copied to my spreadsheet after an exception

Problem Description

First time posting here; constructive criticism is welcome.

I am trying to automate a market analysis task for my company. I wrote a program that worked fine most of the time, but as soon as I added an exception handler to deal with the ZIP Code prompt, it stopped working. Now BeautifulSoup seems to be loading the wrong source page and cannot find the data I need.

I tried updating the link variable that BeautifulSoup collects the source code from, but that did not seem to work.
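Presumably the browser driven by Selenium goes through the ZIP Code step, while the page handed to BeautifulSoup comes from a separate requests.get() call that starts a fresh HTTP session and never sees that step. A minimal sketch of the two different sources being compared (the URL here is just a placeholder):

import requests
from bs4 import BeautifulSoup

url = "https://www.gsaadvantage.gov/advantage/main/start_page.do"  # placeholder

# What the script currently parses: a brand-new session, no ZIP Code state.
soup_from_requests = BeautifulSoup(requests.get(url).text, 'lxml')

# What the browser actually displays after the ZIP step: the live DOM.
# (driver is the webdriver.Chrome instance created in the full script.)
# soup_from_browser = BeautifulSoup(driver.page_source, 'lxml')

Here is the full script: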


# Imports used by the script below
import os
import time

import openpyxl
import requests
import selenium.common.exceptions
from bs4 import BeautifulSoup, Comment
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


# Global variables
os.chdir(r'C:\Users\caleb.grauberger\Desktop\Scripts\Market Analysis for GSA')
WB_ORIG = openpyxl.load_workbook("test_pricelist.xlsx")
SHEET_ORIG = WB_ORIG['Sheet1']
MAX_ROW = SHEET_ORIG.max_row
NEW_WB_NAME = 'market_analysis_test.xlsx'


def pricelist_manip():

    # Note: This is the starting row number for the next for loop
    r = 2
    wb_new = openpyxl.load_workbook(NEW_WB_NAME)
    sheet_new = wb_new.active

    os.chdir(r"C:\Users\user\PycharmProjects\selenium\drivers")
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(options=chrome_options)

    os.chdir(r'C:\Users\caleb.grauberger\Desktop\Scripts\Market Analysis for GSA')

    driver.set_page_load_timeout(20)
    driver.get("https://www.gsaadvantage.gov/advantage/main/start_page.do")
    driver.maximize_window()

    for i in range(2, MAX_ROW + 1):  # rows 2 through MAX_ROW inclusive
        """Copies the value of the Manufacturer's part number and pastes it into the search box"""
        start_time = time.time()

        driver.set_page_load_timeout(20)
        item_id = SHEET_ORIG.cell(row=i, column=1).value
        driver.find_element_by_id("twotabsearchtextbox").clear()
        driver.find_element_by_id("twotabsearchtextbox").send_keys(item_id)
        driver.find_element_by_id("twotabsearchtextbox").send_keys(Keys.ENTER)

        product_links = driver.find_elements_by_xpath("//img[@alt='Click to view product details']/following::a[1]")
        print(i - 1)
        print(item_id)
        links_list = []

        for values in product_links:
            """Creates a list of hyperlinks for each product on the page."""
            hyperlink = values.get_attribute("href")
            links_list.append(hyperlink)

        for x in range(len(links_list)):
            """Goes through the links_list one by one. During each iteration, 
            the program goes to the link, copies the necessary information
            to the spreadsheet, and moves on."""
            driver.set_page_load_timeout(20)
            driver.get(links_list[x])

            # Sometimes a link will bring up an option to enter ZIP Code. This deals with that.

            try:
                driver.find_element_by_id("zip").clear()
                driver.find_element_by_id("zip").send_keys('91911')
                driver.find_element_by_id("zip").send_keys(Keys.ENTER)
                time.sleep(1)
                driver.get(links_list[x])
            except selenium.common.exceptions.NoSuchElementException:
                pass


            rows = r

            source = requests.get(links_list[x]).text
            soup = BeautifulSoup(source, 'lxml')
            body = soup.find('body')
            try:
                part_num = body.find('td', width="78%").text
            except AttributeError:
                print("AttributeError: Contractor Part Number defaulted")
                part_num = item_id
            finally:
                comments = soup.find_all(string=lambda text: isinstance(text, Comment))

                for comment in comments:
                    """Iterates over html searching for contractor names. Then writes contractor names to xlsx"""
                    if comment.strip() == 'Contractor display':
                        contractor = comment.find_next_sibling('td')
                        sheet_new['C' + str(r)] = contractor.text
                        sheet_new['A' + str(r)] = item_id
                        sheet_new['B' + str(r)] = part_num
                        r += 1

                r = rows

                for comment in comments:
                    if comment.strip() == 'unit price':
                        """Iterates over html searching for prices. Then writes prices to xlsx"""
                        price = comment.find_next_sibling('td')
                        sheet_new['D' + str(r)] = price.text
                        r += 1

                r = rows + 3
                time.sleep(1)

        wb_new.save(NEW_WB_NAME)

        # This checks the runtime of the loop
        end_time = time.time()
        duration = end_time - start_time
        duration = round(duration, 2)
        print("Runtime for this iteration was {} seconds.\n".format(duration))
        time.sleep(2)

    wb_new.save(NEW_WB_NAME)
    wb_new.close()

I expect the code to output the following for every price listing of every item:

Part # - Contractor Part # - Contractor Name - Price

An example where I run into the problem is searching for DF2-850-FDPO-KH-S-R. There are five offers that normally get recorded without a hitch. With the code above, no results are recorded at all.

Tags: python, selenium, beautifulsoup

Solution


Here is a suggestion that takes advantage of pandas (you can easily export to Excel afterwards; see the one-liner after the result table below):

import re

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()
#A pandas DataFrame where all results will be stored
global_df = pd.DataFrame(columns=['gsin','Part #', 'Contractor Part #', 'Contractor Name', 'Price','Deliv days',  'Min Order', 'FOB/Shipping'])

part = "MCMS00001"
driver.get("https://www.gsaadvantage.gov/advantage/main/start_page.do")
element = driver.find_element_by_id("twotabsearchtextbox")
element.send_keys(part)
driver.find_element_by_class_name('nav-submit-input').click()


#Find links
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a', href=re.compile('gsin='))
links = list(set([x.get('href') for x in links]))

#Explore links
for link in links:
    driver.get("https://www.gsaadvantage.gov"+link)

    #Sometimes a ZIP Code page shows up
    try:
        element = driver.find_element_by_id("zip")
        element.send_keys("91911")
        driver.find_element_by_name('submit').click()
    except NoSuchElementException: #no ZIP prompt on this page
        pass

    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    #Get the table with the expected elements
    table = soup.find('table', {'class':'greybox'})
    item_numbers = []

    #Extract itemNumbers
    for tr in table.find_all('tr')[1:]: #skip the header row
        item_number = tr.find('a', href=re.compile('itemNumber='))

        if item_number is not None:
            pattern = r'itemNumber=(.*?)&'
            item_numbers.append(re.findall(pattern, item_number.get('href'))[0])

    #Store table into a dataframe
    temp_df = pd.read_html(str(soup.find('table', {'class':'greybox'})))[0]

    #EDIT START : handle missing columns
    #2, 8, 14, 16 and 18 are the positional column indices pd.read_html
    #assigns to the fields we want; create any that are missing so the
    #selection below never raises a KeyError.
    for x in [2, 8, 14, 16, 18]:
        if x not in temp_df.columns:
            temp_df[x] = ""

    temp_df = temp_df[[2,8,14,16,18]].dropna(how='all')
    temp_df = temp_df[1:] #drop the header row, which read_html keeps as a data row here
    temp_df.columns = ['Price', 'Contractor Name', 'Deliv days',  'Min Order', 'FOB/Shipping']
    temp_df['Contractor Part #'] = item_numbers
    temp_df['Part #'] = part

    #Extracting gsin from the explored link which differs in case of multiple links
    gsin = link.split("gsin=")[1]
    temp_df['gsin'] = gsin

    temp_df = temp_df[['gsin','Part #', 'Contractor Part #', 'Contractor Name', 'Price','Deliv days',  'Min Order', 'FOB/Shipping']]


    #EDIT END
    #Append new results to the global dataframe
    global_df = pd.concat([global_df, temp_df])
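A side note on the pd.read_html call above: it parses every <table> element in the HTML it is given and returns a list of DataFrames, which is why the result is indexed with [0]. Tables without a <th> header row come back with integer column labels, which is where the 2/8/14/16/18 indices come from. A self-contained illustration:

from io import StringIO

import pandas as pd

html = "<table><tr><td>Price</td><td>Name</td></tr><tr><td>$1.00</td><td>ACME</td></tr></table>"
dfs = pd.read_html(StringIO(html))  #one DataFrame per <table> found
print(len(dfs))  #1
print(dfs[0])    #integer column labels, header row kept as data row 0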

The final global_df dataframe looks like this:

+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
|   |      gsin      |  Part #   |         Contractor Part #         |    Contractor Name    |  Price  |      Deliv days       |     Min Order      |    FOB/Shipping    |
+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
| 1 | 11000058164089 | MCMS00001 | MCMS00001-MULCM-V54MIL-REGX-XXL   | UNIFIRE INC           | $154.15 | 30 days delivered ARO | O-CONUS,AK,PR,HI   |                    |
| 1 | 11000058164195 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-L      | UNIFIRE INC           | $154.15 | 30 days delivered ARO | NaN                | O-CONUS,AK,PR,HI   |
| 3 | 11000058164195 | MCMS00001 | MCMS00001-MULCM-V54MILREG-L       | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO   | $100.00            | D-CONUS/O-AK,PR,HI |
| 1 | 11000063336537 | MCMS00001 | MCMS00001                         | o                     | $153.99 | $50.00                | D-CONUSND-AK,PR,HI | NaN                |
| 3 | 11000063336537 | MCMS00001 | MCMS00001                         | s dv                  | $157.16 | $50.00                | O-CONUS,AK,PR,HI   | NaN                |
| 1 | 11000058164083 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-S      | UNIFIRE INC           | $154.15 | 30 days delivered ARO | NaN                | O-CONUS,AK,PR,HI   |
| 3 | 11000058164083 | MCMS00001 | MCMS00001-MULCM-V54MILREG-S       | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO   | $100.00            | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163538 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-M      | UNIFIRE INC           | $154.15 | 30 days delivered ARO | NaN                | O-CONUS,AK,PR,HI   |
| 3 | 11000058163538 | MCMS00001 | MCMS00001-MULCM-V54MILREG-M       | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO   | $100.00            | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163624 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-XS     | UNIFIRE INC           | $154.15 | 30 days delivered ARO | NaN                | O-CONUS,AK,PR,HI   |
| 3 | 11000058163624 | MCMS00001 | MCMS00001-MULCM-V54MILREG-XS      | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO   | $100.00            | D-CONUS/O-AK,PR,HI |
| 1 | 11000058163625 | MCMS00001 | MCMS00001-MULCM-V54MIL-REGXX-XXXL | UNIFIRE INC           | $154.15 | 30 days delivered ARO | O-CONUS,AK,PR,HI   |                    |
| 1 | 11000058163763 | MCMS00001 | MCMS00001-MULCM-V54MIL-REG-XL     | UNIFIRE INC           | $154.15 | 30 days delivered ARO | NaN                | O-CONUS,AK,PR,HI   |
| 3 | 11000058163763 | MCMS00001 | MCMS00001-MULCM-V54MILREG-XL      | Blue Water Sales, LLC | $176.47 | 14 days shipped ARO   | $100.00            | D-CONUS/O-AK,PR,HI |
+---+----------------+-----------+-----------------------------------+-----------------------+---------+-----------------------+--------------------+--------------------+
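Since the goal is an Excel deliverable, the export mentioned at the top of this answer is a one-liner (the file name is just an example):

#Write the collected results to an xlsx file; requires openpyxl.
global_df.reset_index(drop=True).to_excel('market_analysis.xlsx', index=False)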
