Why isn't my dataframe appending across iterations?

Problem description

The code below works fine when used with just one url, but when I pass a list of urls and the code finishes, only the product_info from the last url is retrieved.

I'm sure I'm missing something, but I can't figure out what.

from selenium import webdriver 
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import datetime
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
import random

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
#options.add_argument('--headless')

driver = webdriver.Chrome(executable_path=r"/usr/bin/chromedriver", options=options)

#url = 'https://www.coolmod.com/componentes-pc-procesadores?f=375::No'

url_list = [ 
                'https://www.coolmod.com/componentes-pc-procesadores?f=375::No',
                'https://www.coolmod.com/componentes-pc-placas-base?f=55::ATX||prices::65-255',
                # 'https://www.coolmod.com/componentes-pc-memorias-ram?f=41::16GB||473::No||prices::4-209||9999::3549',
                # 'https://www.coolmod.com/discos-ssd?f=501::M.2%20PCI-E%203.0||501::M.2%20PCI-E%204.0||204::500%20GB||204::960%20GB||204::1%20TB',
                # 'https://www.coolmod.com/componentes-pc-fuentes-alimentacion?f=81::Si||80::750||80::850',
                # 'https://www.coolmod.com/disipadores-ventiladores-disipadores?f=9999::2022||prices::35-95',
                # 'https://www.coolmod.com/componentes-pc-torres-cajas?f=9999::1671||103::ATX||prices::60-170',

]

for url in url_list:
        
    driver.get(url)

    sleep(random.uniform(4.0, 7.5))

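    # dismiss the confirmation/cookie popup if it shows up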
    try:
        driver.find_element_by_class_name('confirm').click()
    except NoSuchElementException:
        pass

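    # keep clicking the 'load more' button until it no longer exists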
    while True:
        sleep(random.uniform(3.5, 7.5))
        try:
            ver_mas = driver.find_element_by_class_name('button-load-more')
            actions = ActionChains(driver)
            actions.move_to_element(ver_mas).perform()
            driver.execute_script("arguments[0].click();", ver_mas)
        except NoSuchElementException:
            break

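    # all products are now loaded; hand the full page over to BeautifulSoup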
    page_source = driver.page_source

    soup = BeautifulSoup(page_source, 'lxml')
    # print(soup)

    items = soup.find_all('div', class_='col-lg-12 col-md-12 col-sm-8 col-xs-9 cat-container-text')
    # print(len(items))

    df_list = []
    store = 'Coolmod'
    extraction_date = datetime.datetime.today().replace(microsecond=0)

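    # build one dict of fields per product card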
    for item in items:
        product_name = item.find('div',class_ = 'product-name').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
        # the price div's class varies between products, so try each known variant in turn
        # (a missing div raises AttributeError on .text, not ValueError, so the original
        # chained except clauses never fired)
        price_div = (item.find('div', class_ = 'margin-top-20 mod-product-price text-big')
                     or item.find('div', class_ = 'mod-product-price text-big')
                     or item.find('div', class_ = 'margin-top-20  mod-product-price  text-medium'))
        price = price_div.text.strip().replace('\t','').replace('\n', '').replace('\r', '') if price_div else 'No info'
        # same for availability: the class changes when local stock is shown
        availability_div = (item.find('div', class_ = 'product-availability cat-product-availability')
                            or item.find('div', class_ = 'product-availability cat-product-availability local-available'))
        availability = availability_div.text.strip().replace('\t','').replace('\n', '').replace('\r', '') if availability_div else "No info"

        product_info = {
            'product_name' : product_name,
            'price' : price,
            'availability' : availability,
            'store' : store,
            'date_extraction' : extraction_date,
        }
        df_list.append(product_info)

df = pd.DataFrame(df_list)
print(df)


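# build a timestamped csv filename and save the results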
# path = "C:\\PriceTracking\\coolmod\\"  # Windows path, unused on the Pi
path = '/home/pi/Documents/WebScraping Files/'+store+'/'
mydate = extraction_date.strftime('%Y%m%d')
mytime = extraction_date.strftime('%H%M%S')
filename = path+store+'_'+mydate+'_'+mytime+".csv"

df.to_csv(filename,index=False)

Tags: python, pandas, dataframe, web-scraping

Solution


Just to post the answer:

    df_list = []
    store = 'Coolmod'
    extraction_date = datetime.datetime.today().replace(microsecond=0)

This piece of code needed to be outside of the loop. I placed it right after url_list and now it works fine.
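
To see why this fixes it, here is a minimal, runnable sketch of the same pattern (the batches list is a hypothetical stand-in for the items scraped from each url):

import pandas as pd

batches = [['a', 'b'], ['c', 'd'], ['e', 'f']]  # hypothetical stand-in for per-url items

# Buggy pattern: the accumulator is re-created on every pass,
# so only the rows from the last batch survive
for batch in batches:
    df_list = []                                # re-initialized each iteration: the bug
    for item in batch:
        df_list.append({'product_name': item})
print(pd.DataFrame(df_list))                    # only the 'e' and 'f' rows

# Fixed pattern: the accumulator is created once, before the loop
df_list = []
for batch in batches:
    for item in batch:
        df_list.append({'product_name': item})
print(pd.DataFrame(df_list))                    # all six rows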

Thanks!

