首页 > 解决方案 > Scrapy 爬虫只抓取了 2 页

问题描述

当我运行这段代码时,爬虫只抓取了 2 页就停止了,不会继续转到后面的页面。

我尝试了各种不同的修改方式,但始终无法进入第三页。

# -*- coding: utf-8 -*-
import scrapy
from realstatedata.items import RealstatedataItem

class RsdataSpider(scrapy.Spider):
    """Spider that collects apartment listing titles from vivareal.com.br.

    Every response is handed to :meth:`scrape` for item extraction, after
    which the "Próxima página" (next page) link, if any, is followed with
    :meth:`parse` as the callback again.
    """

    name = 'rsdata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        """Yield the items of this page, then a request for the next page.

        The next-page href is resolved against the current response URL
        with ``response.urljoin`` exactly as it appears in the markup.
        """
        next_links = response.xpath('//a[@title="Próxima página"]/@href')

        # Items of the current page are always emitted first.
        yield from self.scrape(response)

        # No matching anchor: nothing further to schedule.
        if not next_links:
            return

        # NOTE(review): the href is used verbatim; if the site returns a
        # fragment-only value (e.g. "#pagina=2") the joined URL differs from
        # the current one only by its fragment -- confirm against the page.
        href = next_links.extract_first()
        target = response.urljoin(href)
        print("Found url: {}".format(target))
        yield scrapy.Request(target, callback=self.parse)

    def scrape(self, response):
        """Yield one ``RealstatedataItem`` per property card in *response*.

        Only ``description`` is populated; the remaining fields are kept
        below as commented-out XPath expressions.
        """
        # Select the parent element of every property-card <article>.
        cards = response.xpath('//article[@class="property-card__container js-property-card"]/..')

        for card in cards:
            item = RealstatedataItem()

            title = card.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()')
            item['description'] = title.extract_first()
            #item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            #item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            #item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            #item['price_cond'] = resource.xpath('.//strong[@class="js-condo-rice"]/text()').extract_first()
            #item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()

            yield item

标签: python scrapy

解决方案


将“路径”(path)中的 '#' 替换为 '?'(注意:网页上的“下一页”按钮本身不起作用):

import scrapy
from realstatedata.items import RealstatedataItem

class RsdataSpider(scrapy.Spider):
    """Spider that collects apartment listing titles from vivareal.com.br.

    Every response is handed to :meth:`scrape` for item extraction, after
    which the "Próxima página" (next page) link is followed. The site
    exposes that link as a fragment such as ``#pagina=2``; it is rewritten
    to the query string ``?pagina=2`` so that each page is requested under
    its own URL.
    """

    name = 'rsdata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        """Yield the items of this page, then a request for the next page.

        Pagination stops when no next-page anchor is found or when its
        href is empty or a bare ``#``.
        """
        nextpageurl = response.xpath('//a[contains(@title,"Próxima página")]/@href')
        yield from self.scrape(response)

        path = nextpageurl.extract_first()
        if not path:
            # No anchor matched, or its href attribute is empty.
            return

        if path.startswith('#'):
            # Got #pagina=2   =>    Replace with ?pagina=2
            # Only fragment-style hrefs are rewritten; any other href is
            # joined untouched instead of losing its first character.
            query = path[1:]
            if not query:
                # A bare "#" carries no page number to follow.
                return
            path = '?' + query

        nextpage = response.urljoin(path)
        print("Found url: {}".format(nextpage))
        # The callback is spelled out; it is the same default the request
        # would use without it.
        yield scrapy.Request(nextpage, callback=self.parse)

    def scrape(self, response):
        """Yield one ``RealstatedataItem`` per property card in *response*.

        Only ``description`` is populated; the remaining fields are kept
        below as commented-out XPath expressions.
        """
        # The XPath selects the parent element of every property-card <article>.
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):

            item = RealstatedataItem()

            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            #item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            #item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            #item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            #item['price_cond'] = resource.xpath('.//strong[@class="js-condo-rice"]/text()').extract_first()
            #item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()

            yield item

部分输出:

{'description': '  Apartamento com 2 Quartos para Aluguel, 82m²  '}
{'description': '  Apartamento com 4 Quartos à Venda/Aluguel 280m²  '}
{'description': '  Apartamento com 2 Quartos para Aluguel, 70m²  '}
{'description': '  Apartamento com 3 Quartos para Aluguel, 113m²  '}
{'description': '  Apartamento com 2 Quartos para Venda/Aluguel 50m²  '}
{'description': '  Apartamento com 2 Quartos para Venda/Aluguel 50m²  '}
Found url: https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/?pagina=27

推荐阅读