python - Scrapy蜘蛛只刮了2页
问题描述
当我运行这段代码时,蜘蛛只爬了 2 页就停止了,不会继续转到下一页。
我尝试了各种不同的修改方式,但始终无法进入第三页。
# -*- coding: utf-8 -*-
import scrapy
from realstatedata.items import RealstatedataItem
class RsdataSpider(scrapy.Spider):
    """Spider that starts from a vivareal.com.br listing URL.

    Each response is scraped for property cards, and the "Próxima página"
    link, when present, is followed with ``parse`` as the callback.
    """

    name = 'rsdata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        """Yield the items found on ``response``, then a request for the next page.

        The request is only produced when an anchor titled exactly
        "Próxima página" exists in the page.
        """
        yield from self.scrape(response)

        next_links = response.xpath('//a[@title="Próxima página"]/@href')
        if not next_links:
            return

        # NOTE(review): the href is joined unchanged. If the site hands back a
        # fragment-only link such as "#pagina=2", the joined URL differs from
        # the current one only by its fragment -- confirm what is requested.
        target = response.urljoin(next_links.extract_first())
        print("Found url: {}".format(target))
        yield scrapy.Request(target, callback=self.parse)

    def scrape(self, response):
        """Yield one ``RealstatedataItem`` per property card in ``response``.

        Only the ``description`` field is populated; the other extractions are
        left commented out, as in the original post.
        """
        cards_xpath = '//article[@class="property-card__container js-property-card"]/..'
        title_xpath = './/h2/span[@class="property-card__title js-cardLink js-card-title"]/text()'

        for card in response.xpath(cards_xpath):
            listing = RealstatedataItem()
            listing['description'] = card.xpath(title_xpath).extract_first()
            #item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            #item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            #item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            #item['price_cond'] = resource.xpath('.//strong[@class="js-condo-rice"]/text()').extract_first()
            #item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()
            yield listing
解决方案
将“路径”中的 '#' 替换为 '?'(注意:“下一页”按钮本身不起作用):
import scrapy
from realstatedata.items import RealstatedataItem
class RsdataSpider(scrapy.Spider):
    """Spider that walks the paginated vivareal.com.br listing.

    Each response is scraped for property cards. The "Próxima página" link
    carries the page number as a URL fragment (e.g. ``#pagina=2``), so it is
    rewritten into a query string (``?pagina=2``) before being requested.
    """

    name = 'rsdata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        """Yield the items found on ``response``, then a request for the next page.

        The request is only produced when an anchor whose title contains
        "Próxima página" exists in the page.
        """
        nextpageurl = response.xpath('//a[contains(@title,"Próxima página")]/@href')
        yield from self.scrape(response)
        if nextpageurl:
            path = nextpageurl.extract_first()
            # Got #pagina=2 => Replace with ?pagina=2
            # Only fragment-style links are rewritten; any other href (a
            # relative path, an absolute URL) is joined unchanged instead of
            # having its first character overwritten.
            if path.startswith('#'):
                path = '?' + path[1:]
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))
            # Explicit callback for readability; parse is also Scrapy's default.
            yield scrapy.Request(nextpage, callback=self.parse)

    def scrape(self, response):
        """Yield one ``RealstatedataItem`` per property card in ``response``.

        Only the ``description`` field is populated; the other extractions are
        left commented out, as in the original post.
        """
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):
            item = RealstatedataItem()
            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            #item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            #item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            #item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            #item['price_cond'] = resource.xpath('.//strong[@class="js-condo-rice"]/text()').extract_first()
            #item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()
            yield item
部分输出:
{'description': ' Apartamento com 2 Quartos para Aluguel, 82m² '}
{'description': ' Apartamento com 4 Quartos à Venda/Aluguel 280m² '}
{'description': ' Apartamento com 2 Quartos para Aluguel, 70m² '}
{'description': ' Apartamento com 3 Quartos para Aluguel, 113m² '}
{'description': ' Apartamento com 2 Quartos para Venda/Aluguel 50m² '}
{'description': ' Apartamento com 2 Quartos para Venda/Aluguel 50m² '}
Found url: https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/?pagina=27
推荐阅读
- reactjs - 我正在尝试使用 props 将数据从我的子组件发送到父组件,但我收到一条错误消息
- python - 如何将子行添加到由特定列值找到的 QTreeView 行?
- node.js - 使用 node-rdkafka 时主机解析错误
- angular - kendoGridHeaderTemplate 上的 Kendo-Grid 排序
- javascript - Jquery Select 动态添加的元素
- jquery - 标签 ID 在 jQuery 中的 window.location 之后不起作用
- java - 带有异步调用的 Dagger 返回类型
- docker - Traefik.io 和 Docker - 如何只允许对一个特定主机的请求
- sql - Matching on dates by month and year across tables where end of month is different SQL oracle
- ios - Expo如何获取自动生成的ios证书?