My Scrapy code either filters out too much or scrapes the same things repeatedly

Problem description

I'm trying to get scrapy-selenium to navigate a URL while picking up some data along the way. The problem is that it seems to filter out too many requests as duplicates; I'm fairly sure the site doesn't have that much duplicate data. My problem is that I don't know where to apply dont_filter=True. Here is my code:

import scrapy
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from scrapy_selenium import SeleniumRequest
from shutil import which

class AsusSpider(scrapy.Spider):
    name = 'asus'
    allowed_domains = ['www.zandparts.com']
    # start_urls = ['https://www.zandparts.com/en/gateway']
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_path = which("C:/Users/Hp/Downloads/chromedriver.exe")

    #Starting request from just one url
    def start_requests(self):
        yield scrapy.Request('https://www.zandparts.com/en/gateway', self.parse)

    #grabbing the categories and getting all the links to navigate
    def parse(self,response):
        cat_links = response.xpath("//div[@class='category-navigation']/a")
        for link in cat_links:
            category = link.xpath(".//@href").get()
            cat_text = link.xpath(".//h2/text()").get().strip().replace('\r\n','')

            #making the url absolute
            abs = f"https://www.zandparts.com{category}"

            yield scrapy.Request(url=abs, callback=self.parse_x, meta={'category':cat_text})
    
    #grabbing the series and getting all the links as well
    def parse_x(self, response):
        ser_links = response.xpath("//div[@class='category-navigation']/a")
        for link in ser_links:
            series = link.xpath(".//@href").get()
            ser_text = link.xpath(".//h2/text()").get().strip().replace('\r\n','')
            abs2 = f"https://www.zandparts.com{series}"
            cat1 = response.request.meta['category']
            yield scrapy.Request(url=abs2, callback=self.parse_y, meta={'series':ser_text, 'category2':cat1})

    #grabbing each model and navigating to the product page for all the data
    def parse_y(self, response):
        mod_links = response.xpath("//div[@class='category-navigation']/a")
        for link in mod_links:
            model = link.xpath(".//@href").get()
            mod_text = link.xpath(".//h2/text()").get().strip().replace('\r\n','')
            abs3 = f"https://www.zandparts.com{model}"
            ser1 = response.request.meta['series']
            cat2 = response.request.meta['category2']

            yield scrapy.Request(url=abs3, callback=self.parse_z, meta={'model':mod_text, 'series2':ser1, 'category3':cat2})

    #product page. Getting the data
    def parse_z(self,response):       
        products = response.xpath("//div[@class='product__info']/a")
        next_page = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        next_page_full = f"http://www.zandparts.com{next_page}"
        # mod2 = response.request.meta['model']
        # ser2 =response.request.meta['series2']
        # cat3 = response.request.meta['category3']
        for product in products:
            link = product.xpath(".//@href").get()

            absolute_url = f"http://www.zandparts.com{link}"

            yield SeleniumRequest(
                                    url = absolute_url,
                                    callback = self.parse_m,
                                    wait_time=10,
                                    wait_until=EC.element_to_be_clickable((By.LINK_TEXT, 'Tillgängliga alternativ')),
                                    meta={'links':absolute_url, 'model':response.request.meta['model'],'series':response.request.meta['series2'],'category':response.request.meta['category3']}
                                )
         #navigating through each page to get the data on each page                       
        if next_page:
            yield scrapy.Request(url=next_page_full, callback=self.parse_z)

    def parse_m(self, response):
        alternate = response.selector.xpath("//div[@class='product__content']/div/a/span/text()").getall()
        category = response.selector.xpath("//div[@class='product-detail']/ul/li[4]").get().strip().replace("\r\n","")
        name = response.selector.xpath("//h1[@class='product-detail__name']/text()").get().strip().replace("\r\n","")
        part = response.selector.xpath("//div[@class='product-detail']/ul/li").get().strip().replace("\r\n","")
        desc = response.selector.xpath("//div[@class='product-detail__description']").get()
        image = response.selector.xpath("//img[@class='product-detail__image--main']/@src").get()
        absolute_image = f"http://www.zandparts.com{image}"

        yield{
            'product link':response.request.meta['links'],
            'category':response.request.meta['category'],
            'series':response.request.meta['series'],
            'model':response.request.meta['model'],
            'product category':category,
            'product name':name,
            'part number':part,
            'description':desc,
            'image link':absolute_image,
            'alt':alternate               
       }

Here are the crawl stats when I run the code:

{'downloader/request_bytes': 688983,
 'downloader/request_count': 1987,
 'downloader/request_method_count/GET': 1987,
 'downloader/response_bytes': 22314989,
 'downloader/response_count': 1987,
 'downloader/response_status_count/200': 1063,
 'downloader/response_status_count/301': 924,
 'dupefilter/filtered': 10704,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2021, 9, 6, 1, 28, 46, 347956),
 'httpcache/hit': 1987,
 'item_scraped_count': 827,
 'log_count/DEBUG': 2816,
 'log_count/ERROR': 97,
 'log_count/INFO': 10,
 'log_count/WARNING': 1,
 'request_depth_max': 4,
 'response_received_count': 1063,
 'robotstxt/request_count': 1,
 'robotstxt/response_count': 1,
 'robotstxt/response_status_count/200': 1,
 'scheduler/dequeued': 1986,
 'scheduler/dequeued/memory': 1986,
 'scheduler/enqueued': 1986,
 'scheduler/enqueued/memory': 1986,
 'spider_exceptions/KeyError': 97,
 'start_time': datetime.datetime(2021, 9, 6, 1, 28, 22, 201511)}
2021-09-06 01:28:46 [scrapy.core.engine] INFO: Spider closed (finished)

I seem to be getting something wrong, because it filters out too much and doesn't fetch all the data. I also seem to have a problem with the meta data, because as soon as I move on to the next page I get a KeyError.
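I suspect the meta problem is the pagination: the next-page request at the end of parse_z is yielded without a meta dict, so the next parse_z call (and the SeleniumRequests it spawns) has nothing under 'model', 'series2' or 'category3'. Something like this is probably needed there (an untested sketch on my part):

#inside parse_z: forward the accumulated meta so the next page's
#callbacks can still read 'model', 'series2' and 'category3'
if next_page:
    yield scrapy.Request(
        url=next_page_full,
        callback=self.parse_z,
        meta=response.request.meta,
    )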

Tags: python, web-scraping, scrapy, scrapy-selenium

Solution


I ran your code in a clean virtual environment and it works as expected. It doesn't give me a KeyError either, though there are some problems with various xpath expressions. I'm not quite sure what you mean by it filtering out too much data, but your code gives me this output:

(screenshot of the scraped items)

You can fix the text errors (on product category, part number and description) by changing the xpath variables like below:

alternate = response.selector.xpath("//div[@class='product__content']/div/a/span/text()").getall()
category = response.selector.xpath("//div[@class='product-detail']/ul/li[4]/text()[2]").get().strip()
name = response.selector.xpath("//h1[@class='product-detail__name']/text()").get().strip()
part = response.selector.xpath("//div[@class='product-detail']/ul/li/text()[2]").get().strip()
desc = response.selector.xpath("//div[@class='product-detail__description']/text()").get().replace("\r\n","").strip()
image = response.selector.xpath("//img[@class='product-detail__image--main']/@src").get()
absolute_image = f"http://www.zandparts.com{image}"
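If a product page is missing one of these fields, .get() returns None and the chained .strip() raises an AttributeError. A small helper can guard against that (a hypothetical sketch, assuming such pages exist; clean is my own name, not part of the code above):

#hypothetical helper: strip a scraped string, tolerating a missing (None) value
def clean(value, default=""):
    return value.strip().replace("\r\n", "") if value else default

#e.g. category = clean(response.selector.xpath("//div[@class='product-detail']/ul/li[4]/text()[2]").get())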

Here /text()[2] selects the element's second text node, i.e. the value after the label, so the label text and the \r\n padding no longer have to be stripped out by hand. With these changes you get cleaner output:

(screenshot of the cleaned-up items)

Edit:
"How do I change the variables of the parse functions and clean up the meta information of the requests?"

(screenshot of the modified parse functions)

Notice the global variables? I used them instead of the meta values. (I also added dont_filter=True in the start_requests function.) You can plug these global variables into the result yield:

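The screenshot of that code is not legible here, so this is a minimal reconstruction of the idea (a sketch only; the variable names and the shortened callback chain are my assumptions, not the original code):

import scrapy

#module-level variable standing in for the meta dict; parse_x, parse_y and
#parse_z would each set their own global (ser_text, mod_text) the same way
#instead of passing meta along
cat_text = ''

class AsusSpider(scrapy.Spider):
    name = 'asus'
    allowed_domains = ['www.zandparts.com']

    def start_requests(self):
        #dont_filter=True so the dupefilter never drops the entry request
        yield scrapy.Request('https://www.zandparts.com/en/gateway',
                             callback=self.parse, dont_filter=True)

    def parse(self, response):
        global cat_text
        for link in response.xpath("//div[@class='category-navigation']/a"):
            cat_text = link.xpath(".//h2/text()").get().strip()
            url = f"https://www.zandparts.com{link.xpath('.//@href').get()}"
            #the intermediate levels are collapsed in this sketch; the real
            #spider goes through parse_x, parse_y and parse_z first
            yield scrapy.Request(url=url, callback=self.parse_m)

    def parse_m(self, response):
        name = response.xpath("//h1[@class='product-detail__name']/text()").get()
        yield {
            'category': cat_text,  #global instead of response.request.meta
            'product name': name.strip() if name else None,
        }

One caveat with globals: they are shared by all concurrent requests, so with Scrapy's default concurrency a value can change between a request being scheduled and its callback running. Passing meta through every request (including the pagination request in parse_z) is the race-free way to carry per-request state.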

I got results like this:

(screenshot of the resulting items)

If these solutions don't answer your question, we can discuss it further.

