python - Scrapy 只产生最后一个 url 项
问题描述
我正在编写一个简单的网络爬虫来浏览亚马逊页面并获取书籍详细信息。为此,我使用 Selenium 来获取 JS 生成的内容。它遍历 ASIN 列表,但只获取最后一个 ASIN 标题和书籍信息,并重复它的次数与我拥有 ASIN 的次数一样多。我不知道为什么yield
每个网址都有效。这是源代码:
class BooksSpider(scrapy.Spider):
name = 'books'
allowed_domains = ['amazon.com']
# list of ASIN to append to append to URL
list_url = ['B075QL36RW', 'B01ISNIKES', 'B06XG27KV2', 'B00IWGRPRK', 'B00NS42GFW', 'B0178USZ88', 'B00KWGOBQQ', 'B07FXXM638']
def start_requests(self):
self.driver = webdriver.Chrome('/path/to/chromedriver')
for url in self.list_url:
link = f'https://www.amazon.com/dp/{url}'
self.driver.get(link)
yield scrapy.Request(link, self.parse_book)
def parse_book(self, response):
sel = Selector(text=self.driver.page_source)
title_raw = sel.xpath('//*[@id="productTitle"]/text()').extract()
info_raw = sel.xpath('//*[@id="bookDescription_feature_div"]/noscript').extract()
title = ' '.join(''.join(title_raw).split())
info = ' '.join(''.join(info_raw).split())
cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
cleantext = re.sub(cleanr, '', info)
yield {
'title': title,
'info': cleantext
}
解决方案
以下是重构代码如何帮助解决问题:
list_of_urls = ['B075QL36RW', 'B01ISNIKES', 'B06XG27KV2', 'B00IWGRPRK', 'B00NS42GFW']
asin = iter(list_of_urls)
def start_requests(self):
self.driver = webdriver.Chrome('path/to/driver')
self.driver.get('https://amazon.com/dp/B06xt7gkb1')
sel = Selector(text=self.driver.page_source)
url = 'https://amazon.com/dp/B06xt7gkb1'
yield Request(url, callback=self.parse_book)
while True:
next_url = f'https://amazon.com/dp/{next(self.asin)}'
self.driver.get(next_url)
sel = Selector(text=self.driver.page_source)
yield Request(next_url, callback=self.parse_book)
def parse_book(self, response):
...