首页 > 解决方案 > Python BeautifulSoup IndexError:列表索引超出范围

问题描述

import requests
from bs4 import BeautifulSoup
import csv
import time

class LightupScraper:

    results = []

    def fetch(self, url):
        print(f'HTTP GET request to URL: {url}', end='')
        res = requests.get(url)
        print(f' | Status Code: {res.status_code}')

        return res

    def save_response(self, res):
        with open('res.html', 'w') as html_file:
              html_file.write(res)

    def load_response(self):
          html = ''

          with open('res.html', 'r') as html_file:
                for line in html_file:
                     html += line

          return html

    def parse(self, html):

        content = BeautifulSoup(html, 'lxml')
        titles = [title.text.strip() for title in content.find_all('h4', {'class': 'card-title ols-card-title'})]
        links = [link.find('a')['href'] for link in content.find_all('h4', {'class': 'card-title ols-card-title'})]
        skus = [sku.text for sku in content.find_all('span', {'class': 'productView-info-value ols-card-text--sku'})]
        mpn = [mpn.text.split(':')[-1].strip() for mpn in content.find_all('span', {'class': 'productView-info-name mpn-label ols-card-text--mpn'})]
        details = [ul.find_all('li') for ul in content.find_all('ul', {'class': 'ols-card-text__list'})]
        brand = [''.join([brand.text for brand in detail if 'Brand:' in brand.text]).split(':')[-1].strip() for detail in details]
        base = [''.join([base.text for base in detail if 'Base Type:' in base.text]).split(':')[-1].strip() for detail in details]
        life_hours = [''.join([life_hour.text for life_hour in detail if 'Life Hours:' in life_hour.text]).split(':')[-1].strip() for detail in details]
        lumens = [''.join([lumen.text for lumen in detail if 'Lumens:' in lumen.text]).split(':')[-1].strip() for detail in details]
        warrantys = [''.join([warranty.text for warranty in detail if 'Warranty:' in warranty.text]).split(':')[-1].strip() for detail in details]
        wattages = [''.join([wattage.text for wattage in detail if 'Wattage:' in wattage.text]).split(':')[-1].strip() for detail in details]
        features = [feature.text.split() for feature in content.find_all('span', {'class': 'ols-card-text__list--features'})]
        prices = [price.text for price in content.find_all('span', {'class': 'price price--withoutTax'})]



        for feature in features:
             feat = feature

        for item in range(0, len(titles)):
             self.results.append({
              'titles': titles[item],
              'skus': skus[item],
              'mpn': mpn[item],
              'brand': brand[item],
              'base': base[item],
              'life_hours': life_hours[item],
              'lumens': lumens[item],
              'warrantys': warrantys[item],
              'wattages': wattages[item],
              'feature': feat[item],
              'links': links[item],
              'price': prices[item]
          })

    def to_csv(self):
        with open('lightup.csv', 'w', newline='') as csv_file:
              writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
              writer.writeheader()

            for row in self.results:
                 writer.writerow(row)

            print('Exported results to lightup.csv')

    def run(self):
     
       page_num = 3

       for page in range(1, page_num + 1):
            base_url = 'https://www.lightup.com/standard-household-lighting.html?p='
            base_url += str(page)
            res = self.fetch(base_url)
            self.parse(res.text)


    self.to_csv()



 if __name__ == '__main__':
     scraper = LightupScraper()
     scraper.run()

错误:

File "lightup_scraper.py", line 66, in parse
'price': prices[item]
IndexError: list index out of range

我正在尝试抓取价格,但出现列表索引超出范围错误,因为负责价格的标签返回 14 个元素,而其他标签返回 16,这是因为某些价格标签不同,例如每个案例标签的价格是价格 price--withoutTax price-per-case 和单个产品价格 price--withoutTax。我尝试尝试除了块,但没有运气,它给了我另一个列表,而不是单独的价格,我无法理解这个问题可能是有人可以给我一些指示以实际完成这项工作。

标签: pythonpython-3.xlistweb-scrapingbeautifulsoup

解决方案


你可能会做这样的事情:

for item in range(0, len(titles)):
        default = {
            'titles': "x",
            'skus': "x",
            'mpn': "x",
            'brand': "x",
            'base': "x",
            'life_hours': "x",
            'lumens': "x",
            'warrantys': "x",
            'wattages': "x",
            'feature': "x",
            'links': "x",
            'price': "x"
        }
        default['titles'] = titles[item]
        default['skus'] = skus[item]
        default['mpn'] = mpn[item]
        default['brand'] = brand[item]
        default['base'] = base[item]
        default['life_hours'] = life_hours[item]
        default['lumens'] = lumens[item]
        default['warrantys'] = warrantys[item]
        default['wattages'] = wattages[item]
        default['feature'] = feature[item]
        default['links'] = links[item]
        try:
            default['price'] = prices[item]
        except:
            pass
        self.results.append(default)

它不漂亮,但它有效。

问题虽然; 使用您将所有东西都放在列表中的方法,您怎么知道哪个产品缺少价格?

也许添加到 csv 的循环或为每个产品构建某种 dict 可能是更好的方法?


推荐阅读