python-3.x - Selecting a page with a keyword and scraping it
Problem Description
I am trying to scrape item titles from the foreign version of a site. After I run the Python script, the CLI starts but returns nothing.
In iPython, to get a title, title = soup.find('a', {'class': 'vip'}).text works fine on its own, but it does not in the full code run from PyCharm, even though I went into my settings and downloaded the BeautifulSoup package for my current interpreter.
Any idea why? Thanks.
#!/usr/bin/python3
import csv
import time

import requests
from bs4 import BeautifulSoup

product_category = input("Enter your product category: ")


def get_page(url):
    response = requests.get(url)

    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup


def get_detail_data(soup):
    # title
    try:
        title = soup.find('a', {'class': 'vip'}).text
    except:
        title = ''

    # price
    try:
        price = soup.find_all('span', {'itemprop': 'price'})
        for p in price:
            price = p.get('content')
    except:
        price = ''

    # currency
    try:
        currency = soup.find_all('span', {'class': 'bold'}).text
    except:
        currency = ''

    # items sold
    try:
        i_s = soup.find('div', {'class': 'hotness-signal red'}).text
        items_sold = i_s.strip().split(' ')[0]
    except:
        items_sold = ''

    data = {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold
    }

    return data


def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='s-item__link')
    except:
        links = []

    urls = [item.get('href') for item in links]

    return urls


def write_csv(data, url):
    with open('output.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['total sold'], url]
        writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(row)


def main():
    # Store URL formats for each search engine with placeholders
    url = f"https://www.ebay.fr/sch/i.html?_nkw={product_category}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))

    for link in products:
        time.sleep(7)
        data = get_detail_data(get_page(link))
        print(data)
        write_csv(data, link)


if __name__ == '__main__':
    main()
Solution
It seems the .fr site uses different markup, so you need to change the class names/attributes accordingly.
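You can confirm the markup difference before rewriting anything by counting how many elements each candidate selector matches on the fetched search page. A quick sketch (the URL and the selector names are taken from the question and this answer; a count of 0 for a.s-item__link is what makes the original script print nothing):

import requests
from bs4 import BeautifulSoup

# Fetch the same eBay.fr search page the script builds in main()
resp = requests.get("https://www.ebay.fr/sch/i.html?_nkw=ddr4&_pgn=1")
soup = BeautifulSoup(resp.text, 'html.parser')

# Compare the selector from the question with the ones used below;
# whichever matches 0 elements is the wrong one for this site's markup.
for selector in ('a.s-item__link', '.sresult h3 a', 'a.vip'):
    print(selector, '->', len(soup.select(selector)))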
For example, with the selectors adjusted for the .fr markup:
import re
import csv
import time

import requests
from bs4 import BeautifulSoup

product_category = input("Enter your product category: ")


def get_page(url):
    response = requests.get(url)

    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup


def get_detail_data(soup):
    # title
    try:
        title = soup.select_one('h1[itemprop="name"]')
        for span in title.select('span'):
            span.extract()
        title = title.get_text(strip=True)
    except:
        title = ''

    # price
    try:
        price = soup.find_all('span', {'itemprop': 'price'})
        for p in price:
            price = p.get('content')
    except:
        price = ''

    # currency
    try:
        currency = soup.select_one('span[itemprop="priceCurrency"][content]')["content"]
    except:
        currency = ''

    # items sold
    try:
        items_sold = re.findall(r'\d+', soup.select_one('.soldwithfeedback').text)[0]
    except:
        items_sold = ''

    data = {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold
    }

    return data


def get_index_data(soup):
    links = soup.select('.sresult h3 a')
    urls = [item.get('href') for item in links]
    return urls


def write_csv(data, url):
    with open('output.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['total sold'], url]
        writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(row)


def main():
    # Store URL formats for each search engine with placeholders
    url = f"https://www.ebay.fr/sch/i.html?_nkw={product_category}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))

    for link in products:
        time.sleep(0.5)
        data = get_detail_data(get_page(link))
        print(data)
        # write_csv(data, link)  # <-- I commented it, to just print to screen


if __name__ == '__main__':
    main()
Prints:
Enter your product category: ddr4
https://www.ebay.fr/sch/i.html?_nkw=ddr4&_pgn=1
{'title': '16 Go 8 Go 4 Go DDR3 DDR4 1333 1600 1866 2133 RAM 2400 2666 MHz pour HyperX FURY Lot', 'price': '19.74', 'currency': 'USD', 'total sold': '1'}
{'title': '4 Go 8 Go 16 Go DDR4 2133 2400 2666 Mhz pour HyperX FURY DIMM Desktop Mémoire RAM Lot', 'price': '23.87', 'currency': 'USD', 'total sold': '93'}
{'title': '8 Go DDR4 2133 MHz pour HyperX FURY CL15 288 Pin DIMM PC4-17000 Desktop RAM RL1US', 'price': '39.96', 'currency': 'USD', 'total sold': '17'}
{'title': '16 Go G. Skill DDR4 Trident 3200 MHz Z PC4-25600 CL16 1.35 V Double Kit (2x8GB)', 'price': '70.0', 'currency': 'GBP', 'total sold': ''}
{'title': 'DDR4 4 Go 8 Go 16 Go Desktop 2666 MHz Desktop DIMM Mémoire RAM pour Kingston HyperX Fury R1US', 'price': '24.13', 'currency': 'USD', 'total sold': '19'}
{'title': 'Micron 8GB RAM DDR4 1Rx8 PC4-2400T-UAB-10', 'price': '23.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'PATRIOT Viper Blackout 16 Go DDR4 3000 (2x8)', 'price': '54.99', 'currency': 'GBP', 'total sold': ''}
{'title': 'Samsung 8GB RAM DDR4 1Rx8 PC4-2133P SO-DIMM', 'price': '21.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'Kingston 8 Go DDR4 2133 MHz Desktop PC RAM ~~ PC4 17000 Mémoire 2133P 288 broches 2Rx8', 'price': '31.99', 'currency': 'GBP', 'total sold': ''}
...and so on.
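One caveat if you re-enable write_csv: as written, it repeats the header row before every data row. A minimal fix, assuming the same output.csv layout, is to write the header only when the file is first created:

import csv
import os


def write_csv(data, url, filename='output.csv'):
    # Only write the header when the file does not exist yet;
    # later calls just append data rows.
    write_header = not os.path.isfile(filename)

    with open(filename, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow([data['title'], data['price'],
                         data['currency'], data['total sold'], url])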
Recommended reading
- azure - Error "(No Azure DevOps Services or Team Foundation Server repository configuration was found. Ensure)" when sharing a project
- sql - SQL Server - SUM and comma-separated values with a GROUP BY clause
- react-native - React Native Vector Icons do not show in the bottom tab navigation on Android
- powershell - Passing an input object to ConvertTo-Html
- python - Error installing snowflake-connector-python
- php - GDMS (Grandstream) API issue when doing an HTTP POST with curl in PHP
- c - Fixes for issues detected by Flawfinder (CWE-120, CWE-20)
- database - Choosing the best database for an items-and-costs use case
- c# - System.IO.FileCopy sometimes throws "Not enough storage is available to process this command" error message
- reactjs - Running React and Flask with a single shell script