python - Python BeautifulSoup IndexError:列表索引超出范围
问题描述
import requests
from bs4 import BeautifulSoup
import csv
import time
class LightupScraper:
results = []
def fetch(self, url):
print(f'HTTP GET request to URL: {url}', end='')
res = requests.get(url)
print(f' | Status Code: {res.status_code}')
return res
def save_response(self, res):
with open('res.html', 'w') as html_file:
html_file.write(res)
def load_response(self):
html = ''
with open('res.html', 'r') as html_file:
for line in html_file:
html += line
return html
def parse(self, html):
content = BeautifulSoup(html, 'lxml')
titles = [title.text.strip() for title in content.find_all('h4', {'class': 'card-title ols-card-title'})]
links = [link.find('a')['href'] for link in content.find_all('h4', {'class': 'card-title ols-card-title'})]
skus = [sku.text for sku in content.find_all('span', {'class': 'productView-info-value ols-card-text--sku'})]
mpn = [mpn.text.split(':')[-1].strip() for mpn in content.find_all('span', {'class': 'productView-info-name mpn-label ols-card-text--mpn'})]
details = [ul.find_all('li') for ul in content.find_all('ul', {'class': 'ols-card-text__list'})]
brand = [''.join([brand.text for brand in detail if 'Brand:' in brand.text]).split(':')[-1].strip() for detail in details]
base = [''.join([base.text for base in detail if 'Base Type:' in base.text]).split(':')[-1].strip() for detail in details]
life_hours = [''.join([life_hour.text for life_hour in detail if 'Life Hours:' in life_hour.text]).split(':')[-1].strip() for detail in details]
lumens = [''.join([lumen.text for lumen in detail if 'Lumens:' in lumen.text]).split(':')[-1].strip() for detail in details]
warrantys = [''.join([warranty.text for warranty in detail if 'Warranty:' in warranty.text]).split(':')[-1].strip() for detail in details]
wattages = [''.join([wattage.text for wattage in detail if 'Wattage:' in wattage.text]).split(':')[-1].strip() for detail in details]
features = [feature.text.split() for feature in content.find_all('span', {'class': 'ols-card-text__list--features'})]
prices = [price.text for price in content.find_all('span', {'class': 'price price--withoutTax'})]
for feature in features:
feat = feature
for item in range(0, len(titles)):
self.results.append({
'titles': titles[item],
'skus': skus[item],
'mpn': mpn[item],
'brand': brand[item],
'base': base[item],
'life_hours': life_hours[item],
'lumens': lumens[item],
'warrantys': warrantys[item],
'wattages': wattages[item],
'feature': feat[item],
'links': links[item],
'price': prices[item]
})
def to_csv(self):
with open('lightup.csv', 'w', newline='') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
writer.writeheader()
for row in self.results:
writer.writerow(row)
print('Exported results to lightup.csv')
def run(self):
page_num = 3
for page in range(1, page_num + 1):
base_url = 'https://www.lightup.com/standard-household-lighting.html?p='
base_url += str(page)
res = self.fetch(base_url)
self.parse(res.text)
self.to_csv()
if __name__ == '__main__':
scraper = LightupScraper()
scraper.run()
错误:
File "lightup_scraper.py", line 66, in parse
'price': prices[item]
IndexError: list index out of range
我正在尝试抓取价格,但出现列表索引超出范围错误,因为负责价格的标签返回 14 个元素,而其他标签返回 16,这是因为某些价格标签不同,例如每个案例标签的价格是价格 price--withoutTax price-per-case 和单个产品价格 price--withoutTax。我尝试尝试除了块,但没有运气,它给了我另一个列表,而不是单独的价格,我无法理解这个问题可能是有人可以给我一些指示以实际完成这项工作。
解决方案
你可能会做这样的事情:
for item in range(0, len(titles)):
default = {
'titles': "x",
'skus': "x",
'mpn': "x",
'brand': "x",
'base': "x",
'life_hours': "x",
'lumens': "x",
'warrantys': "x",
'wattages': "x",
'feature': "x",
'links': "x",
'price': "x"
}
default['titles'] = titles[item]
default['skus'] = skus[item]
default['mpn'] = mpn[item]
default['brand'] = brand[item]
default['base'] = base[item]
default['life_hours'] = life_hours[item]
default['lumens'] = lumens[item]
default['warrantys'] = warrantys[item]
default['wattages'] = wattages[item]
default['feature'] = feature[item]
default['links'] = links[item]
try:
default['price'] = prices[item]
except:
pass
self.results.append(default)
它不漂亮,但它有效。
问题虽然; 使用您将所有东西都放在列表中的方法,您怎么知道哪个产品缺少价格?
也许添加到 csv 的循环或为每个产品构建某种 dict 可能是更好的方法?
推荐阅读
- arduino - 为什么同步到同一 NTP 服务器的 MAC 和 ESP32 (Arduino) 之间的秒数字段会有大约 30,000 的差异?
- python-3.x - 无法在 Python 3 (MacOS) 上导入 matplotlib.pyplot
- python-3.x - ImportError:DLL 加载失败:找不到指定的过程。(导入sklearn)
- java - 查找所有包含最大值的索引
- ruby - `require': 不能加载这样的文件——curses (LoadError)
- java - 比较对象的更详尽的方法
- php - 如何解决错误“无法加载 PEM 客户端证书,OpenSSL 错误:02001003:system library:fopen:No such process”?
- php - 是否可以在 preg_match 中排除部分匹配字符串?
- php - 如何使用 DOMDocument 从网站 url 读取 JSON
- java - 如何在堆排序中使用 ArrayList