scrapy - Scrapy - 从下一页提取数据
问题描述
我需要你们帮助我实现分页并提取元素。这是我的爬虫(spider):
import json
import scrapy
class YPSpider(scrapy.Spider):
    """Spider for infobel.com business listings that follows pagination.

    Every listing is emitted as a dict with the keys ``title``, ``address``,
    ``village`` and ``phone``.  When the phone number is not directly
    readable on the listing page, a second request is sent to the site's
    ``Search/Decrypt`` endpoint and the item is emitted once that response
    arrives.
    """

    name = 'yp'
    start_urls = ['https://www.infobel.com/fr/france/business/50000/informatique_internet/']

    def parse(self, response):
        """Handle both listing pages and phone-decrypt responses.

        Args:
            response: Either a listing page, or (when
                ``response.meta['has_phone']`` is set) the JSON answer of
                the Decrypt endpoint for a pending item.

        Yields:
            Requests for the next listing page and for phone decryption,
            and item dicts whose phone number is known.
        """
        # Decrypt responses carry the half-built item in ``meta``; they are
        # JSON, so no pagination or listing XPath is evaluated on them.
        if response.meta.get('has_phone'):
            pending = response.meta['item']
            payload = json.loads(response.body)
            pending['phone'] = payload['result']
            yield pending
            return

        # The next page is the <li> right after the active pagination entry.
        next_page_url = response.xpath(
            '//ul[@class="pagination"]/li[@class="active"]/following-sibling::li[1]/a/@href'
        ).extract_first()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse)

        for box in response.xpath('//*[contains(@class, "customer-box")]'):
            raw_lines = box.xpath(
                './/span[contains(@class, "fa-map-marker")]/../span[@class="detail-text"]//text()'
            ).extract()
            address_lines = [
                line.replace('\r', '').replace('\t', '').strip()
                for line in raw_lines
            ]
            # A missing title becomes '' instead of raising on None.strip().
            title = box.xpath(
                './/h2[@class="customer-item-name"]/a/text()'
            ).extract_first(default='').strip()
            address = address_lines[0] if address_lines else ''
            # The second line only exists when there are at least two lines.
            village = address_lines[1] if len(address_lines) >= 2 else ''
            phone = box.xpath(
                './/span[contains(@class, "icon-phone")]/../span[@class="detail-text"]/text()'
            ).extract()

            item = {
                'title': title,
                'address': address,
                'village': village,
                'phone': phone,
            }

            # Listings without any phone text yield nothing.
            if not phone:
                continue
            if phone[0].isnumeric():
                item['phone'] = phone[0]
                yield item
            elif len(phone) >= 2:
                # The second text node is sent to the Decrypt endpoint; the
                # item travels along in ``meta`` and is completed above.
                yield scrapy.Request(
                    'https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={}'.format(phone[1]),
                    callback=self.parse,
                    meta={'item': item, 'has_phone': True},
                )
我怎样才能让这个爬虫进入下一页,并从该页面中抓取元素?提前感谢您的帮助。
解决方案
您需要将此代码添加到您的 parse 方法中:
next_page_url = response.xpath('//ul[@class="pagination"]/li[@class="active"]/following-sibling::li[1]/a/@href').extract_first()
if next_page_url:
yield response.follow(next_page_url, callback=self.parse)
在此处提问之前,您需要阅读一些有关 Python 语法的信息:
def parse(self, response):
    """Handle both listing pages and phone-decrypt responses.

    Args:
        response: Either a listing page, or (when
            ``response.meta['has_phone']`` is set) the JSON answer of the
            Decrypt endpoint for a pending item.

    Yields:
        Requests for the next listing page and for phone decryption, and
        item dicts (``title``, ``address``, ``village``, ``phone``) whose
        phone number is known.
    """
    # Decrypt responses carry the half-built item in ``meta``; they are
    # JSON, so no pagination or listing XPath is evaluated on them.
    if response.meta.get('has_phone'):
        pending = response.meta['item']
        payload = json.loads(response.body)
        pending['phone'] = payload['result']
        yield pending
        return

    # The next page is the <li> right after the active pagination entry.
    next_page_url = response.xpath(
        '//ul[@class="pagination"]/li[@class="active"]/following-sibling::li[1]/a/@href'
    ).extract_first()
    if next_page_url:
        yield response.follow(next_page_url, callback=self.parse)

    for box in response.xpath('//*[contains(@class, "customer-box")]'):
        raw_lines = box.xpath(
            './/span[contains(@class, "fa-map-marker")]/../span[@class="detail-text"]//text()'
        ).extract()
        address_lines = [
            line.replace('\r', '').replace('\t', '').strip()
            for line in raw_lines
        ]
        # A missing title becomes '' instead of raising on None.strip().
        title = box.xpath(
            './/h2[@class="customer-item-name"]/a/text()'
        ).extract_first(default='').strip()
        address = address_lines[0] if address_lines else ''
        # The second line only exists when there are at least two lines.
        village = address_lines[1] if len(address_lines) >= 2 else ''
        phone = box.xpath(
            './/span[contains(@class, "icon-phone")]/../span[@class="detail-text"]/text()'
        ).extract()

        item = {
            'title': title,
            'address': address,
            'village': village,
            'phone': phone,
        }

        # Listings without any phone text yield nothing.
        if not phone:
            continue
        if phone[0].isnumeric():
            item['phone'] = phone[0]
            yield item
        elif len(phone) >= 2:
            # The second text node is sent to the Decrypt endpoint; the item
            # travels along in ``meta`` and is completed above.
            yield scrapy.Request(
                'https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={}'.format(phone[1]),
                callback=self.parse,
                meta={'item': item, 'has_phone': True},
            )
推荐阅读
- go - 如何为 golang 包源创建代理
- postgresql - Postgres width_bucket:函数 width_bucket(bigint, integer[]) 不存在
- 3d - 基于相机旋转和前向矢量的鼠标滚轮缩放功能
- r - 您如何从 R 的 caret 包中提取欠采样数据?
- r - 为什么带有 ranger 引擎的 tidymodels 比 ranger 慢得多?
- r - 基于另一个数据集创建分层样本
- swift - Vapor 4:子关系未急切加载,使用 $ 前缀访问
- angular - 如何在 Angular 中使用 HttpClient 将内容发送到 GET 请求的 req.body 中?
- php - 从数据库查询多行,但在gridview中只在一行上获取它们
- c++ - Direct3D 11 访问被拒绝