Scrapy sometimes returns "None", but not always

Problem Description

I created a Scrapy crawl spider with Rule objects and a LinkExtractor to scrape Amazon best-seller products, but Scrapy sometimes returns None as a result, even though I am sure my XPath expressions are correct. The strange part is that it returns None only occasionally rather than every time. Here is my spider.py code:

from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from ..items import AmzbestsellerItem  # adjust to your project's items module


class AmzcrawlSpider(CrawlSpider):

    name = 'amzcrawl'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/best-sellers-books-Amazon/zgbs/books/ref=zg_bs_unv_b_1_1_1/']


    rules = (
        #THIS RULE IS FOR THE FIRST PAGE OF BESTSELLERS
        Rule(LinkExtractor(restrict_xpaths='//span[@class="zg_selected"]/../following-sibling::ul/li/a'), callback='parse_category', follow=True),

        # THIS RULE IS FOR THE SECOND PAGE OF BESTSELLERS
        Rule(LinkExtractor(restrict_xpaths='//ul[@class="a-pagination"]/li[@class="a-last"]/a'), callback='parse_category', follow=True),
    )

    def parse_category(self, response):
        item = AmzbestsellerItem()
        item['dir_level_1'] = response.xpath('normalize-space(//ul[@id="zg_browseRoot"]/ul/li[@class = "zg_browseUp"]/a/text())').get()
        item['dir_level_2'] = response.xpath('normalize-space(//ul[@id="zg_browseRoot"]/ul/ul/li[@class = "zg_browseUp"]/a/text())').get()
        item['dir_level_3'] = response.xpath('normalize-space(//ul[@id="zg_browseRoot"]/ul/ul/ul/li[@class = "zg_browseUp"]/a/text())').get()
        item['dir_level_4'] = response.xpath('normalize-space(//ul[@id="zg_browseRoot"]/ul/ul/ul/ul/li[@class = "zg_browseUp"]/a/text())').get()
        item['dir_level_5'] = response.xpath('normalize-space(//ul[@id="zg_browseRoot"]/ul/ul/ul/ul/ul/li[@class = "zg_browseUp"]/a/text())').get()

        #NAME OF CURRENT BESTSELLER CATEGORY PAGE
        item['category_name'] = response.xpath('normalize-space(//span[@class="zg_selected"]/text())').get()

        #URL OF CURRENT BESTSELLER CATEGORY PAGE
        item['category_url'] = response.url.split('/ref')[0]

        #THIS CODE IS FOR FOLLOWING INDIVIDUAL PRODUCT PAGE TO GET INFORMATION
        book_containers = response.xpath('//ol[@id="zg-ordered-list" and @class="a-ordered-list a-vertical"]/li')
        for book in book_containers:
            book_href = book.xpath('./span[@class="a-list-item"]/div[@class="a-section a-spacing-none aok-relative"]/span[@class="aok-inline-block zg-item"]/a[@class="a-link-normal"]/@href').get()
            book_url = response.urljoin(book_href)
            item['book_url'] = book_url.split('/ref')[0]

            yield Request(book_url, callback=self.parse_book, meta={'item': item}, dont_filter=True)



    #GETTING INDIVIDUAL BOOK DETAIL. THIS IS WHERE PROBLEMS ARISE. I CANNOT GET ALL THE DETAILS EVEN THOUGH THE XPATH EXPRESSIONS ARE CORRECT
    def parse_book(self, response):
        item = response.meta['item']

        referer = response.request.headers.get('Referer')  # bytes, or None if absent
        item['book_referer'] = referer.decode('utf-8') if referer else None
        item['title'] = response.xpath('normalize-space(//span[@id="productTitle"])').get()
        item['edition'] = response.xpath('normalize-space(//h1[@id="title" and @class="a-spacing-none a-text-normal"]/span[@id = "productSubtitle" and @class = "a-size-large a-color-secondary"]/text())').get()
        item['author'] = response.xpath('normalize-space(//span[@class="author notFaded"]//a[@class="a-link-normal contributorNameID"]/text() | //span[@class="author notFaded"]/a[@class="a-link-normal"]/text())').getall()
        item['rating_num'] = response.xpath('//div[@id="averageCustomerReviews"]//span[@id="acrCustomerReviewText" and @class="a-size-base"]/text()').get()
        item['img_url'] = response.xpath('//div[@id="main-image-container"]//img/@src').get()
        item['publisher'] = response.xpath('//div[@id = "detailBullets_feature_div"]//span[contains(text(),"Publisher")]/following-sibling::span/text()').get()
        item['language'] = response.xpath('//div[@id = "detailBullets_feature_div"]//span[contains(text(),"Language")]/following-sibling::span/text()').get()
        item['isbn10'] = response.xpath('//div[@id = "detailBullets_feature_div"]//span[contains(text(),"ISBN-10")]/following-sibling::span/text()').get()
        item['isbn13'] = response.xpath('//div[@id = "detailBullets_feature_div"]//span[contains(text(),"ISBN-13")]/following-sibling::span/text()').get()
        item['asin'] = response.xpath('//div[@id = "detailBullets_feature_div"]//span[contains(text(),"ASIN")]/following-sibling::span/text()').get()
        item['kindle_price'] = response.xpath('//span[@class="a-size-large mediaTab_title" and contains(text(),"Kindle")]/../following-sibling::div/span[@class="a-size-base mediaTab_subtitle"]/text()').get()
        item['etextbook_price'] = response.xpath('//span[@class="a-size-large mediaTab_title" and contains(text(),"eTextbook")]/../following-sibling::div/span[@class="a-size-base mediaTab_subtitle"]/text()').get()
        item['paperback_price'] = response.xpath('//span[@class="a-size-large mediaTab_title" and contains(text(),"Paperback")]/../following-sibling::div/span[@class="a-size-base mediaTab_subtitle"]/text()').get()
        item['hardcover_price'] = response.xpath('//span[@class="a-size-large mediaTab_title" and contains(text(),"Hardcover")]/../following-sibling::div/span[@class="a-size-base mediaTab_subtitle"]/text()').get()
        item['spiral_price'] = response.xpath('//span[@class="a-size-large mediaTab_title" and contains(text(),"Spiral-bound")]/../following-sibling::div/span[@class="a-size-base mediaTab_subtitle"]/text()').get()


        yield item

I am not sure whether the problem is in my spider.py file or in my pipelines.py file. Here is the code from my pipelines.py file:


from itemadapter import ItemAdapter
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter


class AmzbestsellerPipeline:
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signal=signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, delimiter=";")
        self.exporter.fields_to_export = [
            'dir_level_1', 'dir_level_2', 'dir_level_3', 'dir_level_4', 'dir_level_5',
            'category_name', 'category_url', 'cat_page_num', 'cat_referer', 'book_url',
            'book_referer', 'title', 'edition', 'author', 'rating_num', 'img_url',
            'publisher', 'language', 'isbn10', 'isbn13', 'asin',
            'kindle_price', 'etextbook_price', 'paperback_price', 'hardcover_price', 'spiral_price',
        ]

        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

class DuplicatesPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter['book_url'] in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        else:
            #self.ids_seen.add(adapter['cat_page_num'])
            self.ids_seen.add(adapter['book_url'])
            return item

Tags: python, web-scraping, scrapy, scrapy-pipeline

Solution


I have tested your code, and the error you seem to be running into is 503 Service Unavailable. This is a classic error when scraping Amazon. The simplest fix is to set a USER_AGENT such as "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36" and to enable cookies.
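For reference, here is a minimal settings.py sketch of that suggestion. The USER_AGENT value is the one quoted above; the other two lines are assumptions on my part (cookies are already on by default in Scrapy, and a small download delay tends to reduce 503s), not part of the original advice:

# settings.py

# Identify as a regular browser instead of Scrapy's default user agent.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/80.0.3987.163 Safari/537.36"
)

# Cookies are enabled by default; setting this explicitly documents the intent.
COOKIES_ENABLED = True

# Assumption beyond the answer: throttling requests also helps avoid 503s.
DOWNLOAD_DELAY = 1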

As for the fields that come back as None: always write your XPath against the raw page source (what the server actually returns), not against the DOM rendered in your browser. In your case, many of the fields you query are not present in the raw page source (they are typically filled in by JavaScript in the browser), so they will show up as None.
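A quick way to check this is scrapy shell, which fetches the page the same way your spider does. The URL below is a placeholder; substitute one of the book pages your spider actually visits:

$ scrapy shell "https://www.amazon.com/dp/XXXXXXXXXX"   # placeholder URL
>>> response.status        # confirm you got a 200, not a 503
>>> response.xpath('normalize-space(//span[@id="productTitle"])').get()
>>> view(response)         # opens the response Scrapy received in your browser

If a selector also returns None here, or the field is missing from the page view(response) opens, the data is simply not in the HTML Scrapy receives, and no XPath will find it.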

