python - 为什么我的 Scrapy 蜘蛛只抓取我的一些数据?
问题描述
我正在尝试使用 Scrapy 为Law & Order: SVU的每一集抓取 IMDb 数据(剧集信息和演员表) 。运行下面的代码后,我通过命令行使用“scrapy crawl svu -o svu.csv”将其导出为 CSV。
下面的代码成功提取剧集信息,但 CSV 不包含演员表。如何修复代码以提取和导出剧集信息和演员表?
我的想法和尝试:
- 我相信演员表是被提取出来的,因为它在蜘蛛运行时在终端中是可见的,所以它可能是一个导出问题。
- 如果我注释掉我的第一个 yield 语句(剧集信息),则成功导出演员表。这让我觉得这不仅仅是一个导出问题。
谢谢您的帮助!
import scrapy
class SvuSpider(scrapy.Spider):
    """Crawl IMDb episode pages for Law & Order: SVU.

    For each episode page: yield a dict of episode metadata, follow the
    "See full cast" link to yield per-actor cast dicts, and follow the
    "next episode" link to continue the crawl.
    """

    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        """Extract episode info, then follow cast and next-episode links."""
        # Gather episode information
        yield {
            'season': response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            'episode': response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            'episode_name': response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            'date_published': response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            'rating_value': response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            'rating_count': response.xpath("//span[@itemprop='ratingCount']/text()").extract(),
        }

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        """Yield one {'actor', 'character'} dict per cast-table row.

        BUG FIX: the original looped over the cast <table> element but then
        ran absolute ('//...') XPaths against the whole response, so every
        yielded item contained ALL actors/characters on the page at once.
        Iterate the table rows and query each row with relative ('.//')
        XPaths instead.
        """
        for row in response.xpath("//table[@class='cast_list']//tr"):
            actor = row.xpath(".//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract_first()
            character = row.xpath(".//td[@class='character']/a/text()").extract_first()
            # Header/separator rows have no actor cell — skip them.
            if actor:
                yield {'actor': actor, 'character': character}
解决方案
我添加了对您的代码的更改。此外,我向您展示了如何使用 Items 和 Pipelines。
spiders/svu.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import EpisodeItem, CastItem
class SvuSpider(scrapy.Spider):
    """Crawl IMDb episode pages for Law & Order: SVU.

    Emits EpisodeItem and CastItem instances; the IMDBPipeline routes each
    item type to its own CSV file.
    """

    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        """Extract episode info, then follow cast and next-episode links."""
        # Gather episode information
        item = EpisodeItem(
            season=response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            episode=response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            episode_name=response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            date_published=response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            rating_value=response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            rating_count=response.xpath("//span[@itemprop='ratingCount']/text()").extract(),
        )
        yield item

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        """Yield one CastItem per cast-table row.

        BUG FIX: the original iterated the cast <table> element (loop
        variable unused) and queried the whole response with absolute
        ('//...') XPaths, so each CastItem carried every actor and
        character on the page as one big list. Iterate rows and use
        relative ('.//') XPaths so each item is a single (actor,
        character) pair.
        """
        for row in response.xpath("//table[@class='cast_list']//tr"):
            actor = row.xpath(".//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract_first()
            # Character name is usually inside an <a>, but uncredited roles
            # appear as bare text in the cell.
            character = row.xpath(".//td[@class='character']/a/text()").extract_first()
            if character is None:
                character = row.xpath(".//td[@class='character']/text()").extract_first()
            # Header/separator rows have no actor cell — skip them.
            if not actor:
                continue
            yield CastItem(
                actor=actor.strip(),
                character=character.strip() if character else None,
            )
items.py(文件名必须是 items.py,才能被 `from ..items import ...` 导入)
from scrapy import Item, Field


class EpisodeItem(Item):
    """Episode-level metadata scraped from an IMDb episode page."""

    season = Field()
    episode = Field()
    episode_name = Field()
    date_published = Field()
    rating_value = Field()
    rating_count = Field()


class CastItem(Item):
    """One cast entry: an actor and the character they play."""

    actor = Field()
    character = Field()
pipelines.py(文件名必须是 pipelines.py,才能与 ITEM_PIPELINES 中的路径对应)
from scrapy import signals
from scrapy.exporters import CsvItemExporter

from .items import CastItem, EpisodeItem


class IMDBPipeline(object):
    """Route EpisodeItem and CastItem instances to separate CSV files.

    Opens episode.csv / cast.csv when the spider starts, and finishes the
    exporters and closes the files when the spider stops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it to spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open one CSV file and exporter per item type.

        BUG FIX: the original read ``self.files = self.files = {...}`` —
        a duplicated assignment (harmless at runtime, but a clear typo).
        """
        item_names = ['episode', 'cast']
        # 'w+b': CsvItemExporter writes encoded bytes, so the file must be binary.
        self.files = {n: open('%s.csv' % n, 'w+b') for n in item_names}
        self.exporters = {n: CsvItemExporter(f) for n, f in self.files.items()}
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush exporters, then close the underlying files."""
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for file in self.files.values():
            file.close()

    def process_item(self, item, spider):
        """Dispatch each item to the exporter matching its type."""
        # elif: an item is exactly one of the two types, never both.
        if isinstance(item, EpisodeItem):
            self.exporters['episode'].export_item(item)
        elif isinstance(item, CastItem):
            self.exporters['cast'].export_item(item)
        return item
添加到设置文件:
# Register the CSV-splitting pipeline (lower number = earlier in the pipeline chain).
ITEM_PIPELINES = {
'PROJECT_NAME.pipelines.IMDBPipeline': 300,
}
请注意:您需要把 PROJECT_NAME 替换为您自己的 Scrapy 项目名称。
推荐阅读
- typescript - 类或子类的所有索引的类型
- javascript - react native 中的解析模块 fs 出错
- firebase - 在后端数据库中编码用户定义的排序顺序的明智方法?
- reactjs - TypeScript 没有忽略 node_modules 库
- docker - Travis 上的 Docker 构建失败 --> 找不到应用程序/构建
- flutter - 如何解决文本溢出?
- java - 如何在每个元素上一个接一个地设置动画而不是一起设置动画
- rest - REST API 添加条形码方法
- r - 在 Shiny 的 reactiveValues 中使用分配?
- reactjs - 通过反应上下文暴露的测试方法