python - 将scrapy-splash与 crawlera 一起使用时出现 504 超时异常
问题描述
我尝试对 http://www.google.com 使用 scrapy-splash，并遵循了 Github 仓库 https://github.com/scrapy-plugins/scrapy-splash 中给出的所有先决条件步骤，我能够成功渲染 Google 页面。
但是，当我按照 Github 仓库 https://github.com/scrapinghub/sample-projects/tree/master/splash_crawlera_example 中的说明将 crawlera 与 scrapy-splash 集成，再尝试同一个 http://www.google.com 时，我总是收到 504 超时异常。
splash_crawlera_example中提到的默认示例 url http://quotes.toscrape.com/js/已成功通过 crawlera 渲染,但不是 Google,是否需要使用脚本更改任何内容以渲染 Google 页面?
这是 quotes-js.py
from pkgutil import get_data
import scrapy
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header
class QuotesJsSpider(scrapy.Spider):
    """Render JS-heavy pages through Splash, proxied via Crawlera.

    The Crawlera proxying itself is done by a Lua script loaded from the
    project's package data and sent to the Splash 'execute' endpoint.
    """
    name = 'quotes-js'

    def __init__(self, *args, **kwargs):
        # The Lua script must be shipped with the deployed project: keep the
        # "package_data" entry in setup.py so get_data() can locate it on
        # Scrapy Cloud as well as locally.
        self.LUA_SOURCE = get_data(
            'splash_crawlera_example', 'scripts/crawlera.lua'
        ).decode('utf-8')
        super(QuotesJsSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        yield SplashRequest(
            # url='http://quotes.toscrape.com/js/',
            url='http://www.google.com',
            endpoint='execute',
            splash_headers={
                'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                'wait': 0.5, 'viewport': '1024x2480', 'images': 0, 'timeout': 90
            },
            # Let Splash cache the Lua script so it is not resent per request.
            cache_args=['lua_source'],
        )

    def parse(self, response):
        # Emit one item per quote on the rendered page.
        for quote_sel in response.css('div.quote'):
            yield {
                'text': quote_sel.css('span.text::text').extract_first(),
                'author': quote_sel.css('span small::text').extract_first(),
                'tags': quote_sel.css('div.tags a.tag::text').extract(),
            }
        # Follow pagination, again through Splash + Crawlera.
        next_href = response.css('li.next > a::attr(href)').extract_first()
        if next_href:
            yield SplashRequest(
                url=response.urljoin(next_href),
                endpoint='execute',
                splash_headers={
                    'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
                },
                args={
                    'lua_source': self.LUA_SOURCE,
                    'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                },
                cache_args=['lua_source'],
            )
settings.py
# -*- coding: utf-8 -*-
# Scrapy project settings: Splash rendering + Crawlera proxying.
BOT_NAME = 'splash_crawlera_example'

SPIDER_MODULES = ['splash_crawlera_example.spiders']
NEWSPIDER_MODULE = 'splash_crawlera_example.spiders'

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 300,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

CRAWLERA_APIKEY = ''  # Your crawlera API key

# Splash settings
SPLASH_URL = 'http://localhost:8050/'  # Splash instance URL from Scrapy Cloud
SPLASH_APIKEY = ''  # Your API key for the Splash instance hosted on Scrapy Cloud

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 100
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 1800
# Bug fix: this was misspelled "DOENLOAD_DELAY", so Scrapy silently ignored
# the intended 1-second delay between requests.
DOWNLOAD_DELAY = 1

# NOTE(review): "DEFAULT_HEADERS" is not a built-in Scrapy setting (the
# built-in one is DEFAULT_REQUEST_HEADERS) — confirm which name the
# Crawlera middleware actually reads before relying on this.
DEFAULT_HEADERS = {
    'X-Crawlera-Max-Retries': 0
}
crawlera.lua
--- Route every request made by this Splash session through Crawlera,
-- maintaining a sticky Crawlera session across requests.
-- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
-- Have a look at the file spiders/quotes-js.py to see how to do it.
-- Find your Crawlera credentials in https://app.scrapinghub.com/
function use_crawlera(splash)
  local user = splash.args.crawlera_user
  local host = 'proxy.crawlera.com'
  local port = 8010
  local session_header = 'X-Crawlera-Session'
  -- 'create' asks Crawlera for a fresh session; replaced by the real
  -- session id once the first response headers come back.
  local session_id = 'create'

  splash:on_request(function (request)
    -- The commented code below can be used to speed up the crawling
    -- process. They filter requests to undesired domains and useless
    -- resources. Uncomment the ones that make sense to your use case
    -- and add your own rules.
    --
    -- Discard requests to advertising and tracking domains.
    -- if string.find(request.url, 'doubleclick%.net') or
    --    string.find(request.url, 'analytics%.google%.com') then
    --   request.abort()
    --   return
    -- end
    --
    -- Avoid using Crawlera for subresources fetching to increase crawling
    -- speed. The example below avoids using Crawlera for URLS starting
    -- with 'static.' and the ones ending with '.png'.
    -- if string.find(request.url, '://static%.') ~= nil or
    --    string.find(request.url, '%.png$') ~= nil then
    --   return
    -- end

    request:set_header('X-Crawlera-Cookies', 'disable')
    request:set_header(session_header, session_id)
    request:set_proxy{host, port, username=user, password=''}
  end)

  splash:on_response_headers(function (response)
    -- Bug fix: the original condition was
    --   type(response.headers[session_header]) ~= nil
    -- which is ALWAYS true, because type() returns a string ("nil",
    -- "string", ...), never the value nil. That overwrote session_id
    -- with nil whenever the header was missing, breaking the sticky
    -- session on subsequent requests.
    if response.headers[session_header] ~= nil then
      session_id = response.headers[session_header]
    end
  end)
end
--- Entry point invoked by Splash's 'execute' endpoint: enable Crawlera
-- proxying, load the requested page, and return the rendered HTML.
function main(splash)
  local target = splash.args.url
  use_crawlera(splash)
  splash:go(target)
  return splash:html()
end
解决方案
推荐阅读
- ios - 有没有办法使用 Swift 调整文本大小以适应 UITextView?我看过几年前的一些例子,但它们是用 Objective-C 编写的
- visual-studio - ReSharper CLT 和 ReSpeller 与 NuGet
- javascript - JS 到 jQuery 的转换
- mqtt - 为什么我用 PAHO mqtt 发话题的时候,出现了 return-8?
- javascript - 我不断收到解析错误:相邻的 JSX 元素必须包含在封闭标记中
- python - 将树节点的路径打印为列表
- perl - eval 不打印“Hello World”并且仍然输出到 stderr
- css - 将 flexbox 元素粘贴到容器的每一侧(一个元素在左侧,另一个在右侧)
- c - 如何逐位获取任意十进制数
- python - 无法通过 Windows 终端导入 django