python - Scraping table data with Scrapy when the site has no "Next" button
Problem Description
I am new to Scrapy, and I am trying to get the table data from every page of this website. Here is my code:
import scrapy

class UAESpider(scrapy.Spider):
    name = 'uae_free'
    allowed_domains = ['https://www.uaeonlinedirectory.com']
    start_urls = [
        'https://www.uaeonlinedirectory.com/UFZOnlineDirectory.aspx?item=A'
    ]

    def parse(self, response):
        pages = response.xpath('//table[@class="GridViewStyle"]//tr[12]')
        for page in pages[1:11]:
            rows = page.xpath('//table[@class="GridViewStyle"]//tr')
            for row in rows[1:11]:
                yield {
                    'company_name': row.xpath('.//td[2]//text()').get(),
                    'company_name_link': row.xpath('.//td[2]//a/@href').get(),
                    'zone': row.xpath('.//td[4]//text()').get(),
                    'category': row.xpath('.//td[6]//text()').get(),
                    'category_link': row.xpath('.//td[6]//a/@href').get()
                }

        next_page = response.xpath('//table[@class="GridViewStyle"]//tr[12]//td[11]//a/@href').get()
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.parse)
But it does not work, and I get the following error, where the URL is the link for page 11:
ValueError: Missing scheme in request url: javascript:__doPostBack('ctl00$ContentPlaceHolder2$grdDirectory','Page$11')
Does anyone know how to fix this bug?
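For context, the href extracted from the pager is not a real URL: ASP.NET GridView paging goes through a JavaScript __doPostBack call that re-submits the page's form with __EVENTTARGET and __EVENTARGUMENT set, so there is no next-page address for scrapy.Request to fetch. A minimal sketch of the usual workaround, assuming the page uses the standard aspnetForm, is to replay the postback as a form submission with FormRequest.from_response, which copies the hidden state fields automatically:

from scrapy.http import FormRequest

# Replay __doPostBack('ctl00$ContentPlaceHolder2$grdDirectory', 'Page$11')
# as a POST of the page's form instead of following the javascript: href.
yield FormRequest.from_response(
    response,
    formid='aspnetForm',   # assumption: the page's single ASP.NET form
    formdata={
        '__EVENTTARGET': 'ctl00$ContentPlaceHolder2$grdDirectory',
        '__EVENTARGUMENT': 'Page$11',
    },
    dont_click=True,       # submit without simulating a button click
    callback=self.parse,
    dont_filter=True,
)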
Update:
Following the instructions in this answer, as @zmike suggested, this is what I have done so far:
import scrapy
from scrapy.http import FormRequest

URL = 'https://www.uaeonlinedirectory.com/UFZOnlineDirectory.aspx?item=A'

class UAESpider(scrapy.Spider):
    name = 'uae_free'
    allowed_domains = ['https://www.uaeonlinedirectory.com/UFZOnlineDirectory.aspx?item=A']
    start_urls = [
        'https://www.uaeonlinedirectory.com/UFZOnlineDirectory.aspx?item=A'
    ]

    def parse(self, response):
        self.data = {}
        for form_input in response.css('form#aspnetForm input'):
            name = form_input.xpath('@name').extract()[0]
            try:
                value = form_input.xpath('@value').extract()[0]
            except IndexError:
                value = ""
            self.data[name] = value
        self.data['ctl00_ContentPlaceHolder2_panelGrid'] = 'ctl00$ContentPlaceHolder2$grdDirectory'
        self.data['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder2$grdDirectory'
        self.data['__EVENTARGUMENT'] = 'Page$1'
        return FormRequest(url=URL,
                           method='POST',
                           callback=self.parse_page,
                           formdata=self.data,
                           meta={'page': 1},
                           dont_filter=True)

    def parse_page(self, response):
        current_page = response.meta['page'] + 1
        rows = response.xpath('//table[@class="GridViewStyle"]//tr')
        for row in rows[1:11]:
            yield {
                'company_name': row.xpath('.//td[2]//text()').get(),
                'company_name_link': row.xpath('.//td[2]//a/@href').get(),
                'zone': row.xpath('.//td[4]//text()').get(),
                'category': row.xpath('.//td[6]//text()').get(),
                'category_link': row.xpath('.//td[6]//a/@href').get()
            }
        return FormRequest(url=URL,
                           method='POST',
                           formdata={
                               '__EVENTARGUMENT': 'Page$%d' % current_page,
                               '__EVENTTARGET': 'ctl00$ContentPlaceHolder2$grdDirectory',
                               'ctl00_ContentPlaceHolder2_panelGrid': 'ctl00$ContentPlaceHolder2$grdDirectory',
                               '': ''},
                           meta={'page': current_page},
                           dont_filter=True)
This code only gets the table data from the first page and does not move on to the remaining pages. Do you know where I went wrong?
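Two things stand out in the updated spider. First, parse_page is a generator (it yields items), so the final return FormRequest(...) never hands the request to Scrapy; it has to be yielded. Second, the follow-up request does not re-send the fresh __VIEWSTATE and __EVENTVALIDATION tokens from the current response, which ASP.NET validates on every postback. A minimal sketch of a fixed tail for parse_page, reusing the token-extraction idea from the answer below:

# Sketch: re-read the ASP.NET state tokens from the current response
# and yield (not return) the next-page request.
state = {
    name: response.xpath('//input[@id="%s"]/@value' % name).get(default='')
    for name in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION')
}
state['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder2$grdDirectory'
state['__EVENTARGUMENT'] = 'Page$%d' % current_page
yield FormRequest(url=URL,                 # FormRequest POSTs by default
                  formdata=state,
                  callback=self.parse_page,
                  meta={'page': current_page},
                  dont_filter=True)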
Solution
Here is a working (albeit rough) implementation of a spider that iterates through all the pages. A few notes:
- The form data needs different parameters, such as __EVENTTARGET, __EVENTVALIDATION, __VIEWSTATEGENERATOR, etc.
- I used XPath rather than regular expressions to get them.
- The following is unnecessary: self.data['ctl00_ContentPlaceHolder2_panelGrid'] = 'ctl00$ContentPlaceHolder2$grdDirectory'
- For simplicity, I combined the functions. The callback allows it to iterate through all the pages.
import scrapy
from scrapy.http import FormRequest

class UAESpider(scrapy.Spider):
    name = 'uae_free'

    headers = {
        'X-MicrosoftAjax': 'Delta=true',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36'
    }

    allowed_domains = ['www.uaeonlinedirectory.com']

    # TODO: Include the urls for all other items (e.g. A-Z)
    start_urls = ['https://www.uaeonlinedirectory.com/UFZOnlineDirectory.aspx?item=A']

    current_page = 0

    def parse(self, response):
        # request the next page
        self.current_page = self.current_page + 1

        if self.current_page == 1:
            # submit a form (first page)
            data = {}
            for form_input in response.css('form#aspnetForm input'):
                name = form_input.xpath('@name').extract()[0]
                try:
                    value = form_input.xpath('@value').extract()[0]
                except IndexError:
                    value = ""
                data[name] = value
            data['__EVENTTARGET'] = 'ctl00$MainContent$List'
            data['__EVENTARGUMENT'] = 'Page$1'
        else:
            # Extract the form fields and arguments using XPATH
            event_validation = response.xpath('//input[@id="__EVENTVALIDATION"]/@value').extract()
            view_state = response.xpath('//input[@id="__VIEWSTATE"]/@value').extract()
            view_state_generator = response.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value').extract()
            view_state_encrypted = response.xpath('//input[@id="__VIEWSTATEENCRYPTED"]/@value').extract()
            data = {
                '__EVENTTARGET': 'ctl00$ContentPlaceHolder2$grdDirectory',
                '__EVENTARGUMENT': 'Page$%d' % self.current_page,
                '__EVENTVALIDATION': event_validation,
                '__VIEWSTATE': view_state,
                '__VIEWSTATEGENERATOR': view_state_generator,
                '__VIEWSTATEENCRYPTED': view_state_encrypted,
                '__ASYNCPOST': 'true',
                '': ''
            }

        # Yield the companies
        # TODO: move this to a different function
        rows = response.xpath('//table[@class="GridViewStyle"]//tr')
        for row in rows[1:11]:
            result = {
                'company_name': row.xpath('.//td[2]//text()').get(),
                'company_name_link': row.xpath('.//td[2]//a/@href').get(),
                'zone': row.xpath('.//td[4]//text()').get(),
                'category': row.xpath('.//td[6]//text()').get(),
                'category_link': row.xpath('.//td[6]//a/@href').get()
            }
            print(result)
            yield result

        # TODO: check if there is a next page, and only yield if there is one
        yield FormRequest(url=self.start_urls[0],  # TODO: change this so that index is not hardcoded
                          method='POST',
                          formdata=data,
                          callback=self.parse,
                          meta={'page': self.current_page},
                          dont_filter=True,
                          headers=self.headers)
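One way to resolve the "check if there is a next page" TODO, sketched under the assumption (taken from the question's pager XPath) that row 12 of the grid holds the pager and every reachable page appears there as a __doPostBack link containing "Page$<n>", is to guard the final yield in parse():

# Sketch: only request the next page if the pager still links to it.
# The current page is rendered as plain text rather than a link, so the
# first postback (Page$1, re-posting the page we are on) is always allowed.
has_next = response.xpath(
    '//table[@class="GridViewStyle"]//tr[12]'
    '//a[contains(@href, "Page$%d")]' % self.current_page)
if self.current_page == 1 or has_next:
    yield FormRequest(url=self.start_urls[0],
                      method='POST',
                      formdata=data,
                      callback=self.parse,
                      meta={'page': self.current_page},
                      dont_filter=True,
                      headers=self.headers)

With the spider saved in a Scrapy project, it can then be run as usual, e.g. scrapy crawl uae_free -o companies.json.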