Select pages containing a keyword and scrape them

Problem Description

I'm trying to scrape item titles from the foreign version of a website. When I run the Python script, the CLI starts but returns nothing.

In IPython, getting the title with title = soup.find('a', {'class': 'vip'}).text works fine on its own, but inside the full code in PyCharm it doesn't, even though I went into my settings and installed the BeautifulSoup package for my current interpreter.

Any idea why? Thanks.

#!/usr/bin/python3

import csv
import time
import requests
from bs4 import BeautifulSoup

product_category = input("Enter your product category: ")


def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
    return soup


def get_detail_data(soup):
    # title
    try:
        title = soup.find('a', {'class': 'vip'}).text

    except:
        title = ''

    # price
    try:
        price = soup.find_all('span', {'itemprop': 'price'})
        for p in price:
            price = p.get('content')
    except:
        price = ''

    # currency
    try:
        currency = soup.find_all('span', {'class': 'bold'}).text

    except:
        currency = ''

    # items sold
    try:
        i_s = soup.find('div', {'class': 'hotness-signal red'}).text
        items_sold = i_s.strip().split(' ')[0]
    except:
        items_sold = ''

    data = {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold
    }

    return data


def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='s-item__link')
    except:
        links = []

    urls = [item.get('href') for item in links]

    return urls


def write_csv(data, url):
    with open('output.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['total sold'], url]
        writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(row)


def main():
    # Store URL formats for each search engine with placeholders
    url = f"https://www.ebay.fr/sch/i.html?_nkw={product_category}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))

    for link in products:
        time.sleep(7)
        data = get_detail_data(get_page(link))
        print(data)
        write_csv(data, link)


if __name__ == '__main__':
    main()

Tags: python-3.x, beautifulsoup

Solution


It seems the .fr site uses different markup, so you need to change the class names/attributes accordingly.
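
A quick way to confirm what markup the .fr results page actually serves is to dump the class attribute of the links it contains and pick selectors from that. A minimal sketch, using the same requests/BeautifulSoup setup as above (the ddr4 query is just a placeholder):

import requests
from bs4 import BeautifulSoup

# Fetch one eBay.fr results page and list the class attribute of each link,
# so selectors can be chosen from what is really in the page.
resp = requests.get("https://www.ebay.fr/sch/i.html?_nkw=ddr4&_pgn=1")
soup = BeautifulSoup(resp.text, 'html.parser')

for a in soup.find_all('a', href=True)[:40]:
    print(a.get('class'), a['href'][:70])

Whatever class names show up there are the ones to plug into find/find_all/select.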

For example:

import re
import csv
import time
import requests
from bs4 import BeautifulSoup

product_category = input("Enter your product category: ")


def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
    return soup


def get_detail_data(soup):
    # title
    try:
        title = soup.select_one('h1[itemprop="name"]')
        for span in title.select('span'):
            span.extract()
        title = title.get_text(strip=True)
    except:
        title = ''

    # price
    try:
        price = soup.find_all('span', {'itemprop': 'price'})
        for p in price:
            price = p.get('content')
    except:
        price = ''

    # currency
    try:
        currency = soup.select_one('span[itemprop="priceCurrency"][content]')["content"]
    except:
        currency = ''

    # items sold
    try:
        items_sold = re.findall(r'\d+', soup.select_one('.soldwithfeedback').text)[0]
    except:
        items_sold = ''

    data = {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold
    }

    return data


def get_index_data(soup):
    links = soup.select('.sresult h3 a')
    urls = [item.get('href') for item in links]
    return urls


def write_csv(data, url):
    with open('output.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['total sold'], url]
        writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(row)


def main():
    # Store URL formats for each search engine with placeholders
    url = f"https://www.ebay.fr/sch/i.html?_nkw={product_category}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))

    for link in products:
        time.sleep(0.5)
        data = get_detail_data(get_page(link))
        print(data)
        # write_csv(data, link)  # <-- I commented it, to just print to screen


if __name__ == '__main__':
    main()

Prints:

Enter your product category: ddr4
https://www.ebay.fr/sch/i.html?_nkw=ddr4&_pgn=1
{'title': '16 Go 8 Go 4 Go DDR3 DDR4 1333 1600 1866 2133 RAM 2400 2666 MHz pour HyperX FURY Lot', 'price': '19.74', 'currency': 'USD', 'total sold': '1'}
{'title': '4 Go 8 Go 16 Go DDR4 2133 2400 2666 Mhz pour HyperX FURY DIMM Desktop Mémoire RAM Lot', 'price': '23.87', 'currency': 'USD', 'total sold': '93'}
{'title': '8 Go DDR4 2133 MHz pour HyperX FURY CL15 288 Pin DIMM PC4-17000 Desktop RAM RL1US', 'price': '39.96', 'currency': 'USD', 'total sold': '17'}
{'title': '16 Go G. Skill DDR4 Trident 3200 MHz Z PC4-25600 CL16 1.35 V Double Kit (2x8GB)', 'price': '70.0', 'currency': 'GBP', 'total sold': ''}
{'title': 'DDR4 4 Go 8 Go 16 Go Desktop 2666 MHz Desktop DIMM Mémoire RAM pour Kingston HyperX Fury R1US', 'price': '24.13', 'currency': 'USD', 'total sold': '19'}
{'title': 'Micron 8GB RAM DDR4 1Rx8 PC4-2400T-UAB-10', 'price': '23.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'PATRIOT Viper Blackout 16 Go DDR4 3000 (2x8)', 'price': '54.99', 'currency': 'GBP', 'total sold': ''}
{'title': 'Samsung 8GB RAM DDR4 1Rx8 PC4-2133P SO-DIMM', 'price': '21.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'Kingston 8 Go DDR4 2133 MHz Desktop PC RAM ~~ PC4 17000 Mémoire 2133P 288 broches 2Rx8', 'price': '31.99', 'currency': 'GBP', 'total sold': ''}

...and so on.
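
One more note if you re-enable write_csv: as written it writes the header row before every data row, so 'Title,Price,Currency,...' is repeated throughout output.csv. A minimal variant, assuming you only want the header once, checks whether the file already exists before appending:

import csv
import os


def write_csv(data, url, filename='output.csv'):
    # Write the header only when the file is first created, then append one data row.
    write_header = not os.path.exists(filename)
    with open(filename, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow([data['title'], data['price'], data['currency'], data['total sold'], url])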
