python - 如果 Scrapy Spider 的条件
问题描述
我正在尝试抓取IMDB 并获取有关 Steven Spielberg 或 Martin Scorsese 导演的电影的信息。问题是 HTML 代码中的“预算”行对于某些电影是 div[7] 而对于其他电影是 div[8]。我正在尝试编写一个函数,如果满足条件,它将获取数据(在这种情况下 - 如果 div 包含单词“budget”,则提取)下面是我的代码:
# Import the needed packages
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import numpy as np
### Scrape movies of Steven Spielberg ###
## IMDB_Spider1
# Create the Spider class1
class IMDB_Spider1(scrapy.Spider):
name = "IMDB_spider1"
# start_requests method
def start_requests( self ):
url = 'https://www.imdb.com/name/nm0000229/?ref_=nv_sr_srsg_0'
yield scrapy.Request( url = url,
callback = self.parse_front )
# First parsing method
def parse_front(self, response):
links_to_follow = response.xpath('//*[contains(@class,"filmo-row even")]/b/a/@href').extract()
for url in links_to_follow:
yield response.follow(url = url,
callback = self.parse_pages)
# Second parsing method
def parse_pages(self, response):
title = response.css('h1::text').extract_first().strip()
year = response.css('#titleYear a::text').extract_first()
rating = response.css('.subtext::text').extract_first().strip() or None
users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
countries = [country.strip() for country in countries]
languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
languages = [language.strip() for language in languages]
actors = response.xpath('//td[not(@class)]/a/text()').extract()
actors = [actor.strip() for actor in actors]
tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
tagline = ''.join(tagline).strip() or None
directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
directors = [director.strip() for director in directors]
runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract()
gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
imdb_url = response.url.replace('?ref_=adv_li_tt', '')
mv1_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url
## IMDB_Spider2
# Create the Spider class2
class IMDB_Spider2(scrapy.Spider):
name = "IMDB_spider2"
# start_requests method
def start_requests( self ):
url = 'https://www.imdb.com/name/nm0000229/?ref_=nv_sr_srsg_0'
yield scrapy.Request( url = url,
callback = self.parse_front )
# First parsing method
def parse_front(self, response):
links_to_follow = response.xpath('//*[contains(@class,"filmo-row odd")]/b/a/@href').extract()
for url in links_to_follow:
yield response.follow(url = url,
callback = self.parse_pages)
# Second parsing method
def parse_pages(self, response):
title = response.css('h1::text').extract_first().strip()
year = response.css('#titleYear a::text').extract_first()
rating = response.css('.subtext::text').extract_first().strip() or None
users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
countries = [country.strip() for country in countries]
languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
languages = [language.strip() for language in languages]
actors = response.xpath('//td[not(@class)]/a/text()').extract()
actors = [actor.strip() for actor in actors]
tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
tagline = ''.join(tagline).strip() or None
directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
directors = [director.strip() for director in directors]
runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract()
gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
imdb_url = response.url.replace('?ref_=adv_li_tt', '')
mv2_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url
### Scrape movies of Martin Scorsese ###
## IMDB_Spider3
# Create the Spider class3
class IMDB_Spider3(scrapy.Spider):
name = "IMDB_spider3"
# start_requests method
def start_requests( self ):
url = 'https://www.imdb.com/name/nm0000217/?ref_=nv_sr_srsg_0'
yield scrapy.Request( url = url,
callback = self.parse_front )
# First parsing method
def parse_front(self, response):
links_to_follow = response.xpath('//*[contains(@class,"filmo-row even")]/b/a/@href').extract()
for url in links_to_follow:
yield response.follow(url = url,
callback = self.parse_pages)
# Second parsing method
def parse_pages(self, response):
title = response.css('h1::text').extract_first().strip()
year = response.css('#titleYear a::text').extract_first()
rating = response.css('.subtext::text').extract_first().strip() or None
users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
countries = [country.strip() for country in countries]
languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
languages = [language.strip() for language in languages]
actors = response.xpath('//td[not(@class)]/a/text()').extract()
actors = [actor.strip() for actor in actors]
tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
tagline = ''.join(tagline).strip() or None
directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
directors = [director.strip() for director in directors]
runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract()
gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
imdb_url = response.url.replace('?ref_=adv_li_tt', '')
mv3_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url
## IMDB_Spider4
# Create the Spider class4
class IMDB_Spider4(scrapy.Spider):
name = "IMDB_spider4"
# start_requests method
def start_requests( self ):
url = 'https://www.imdb.com/name/nm0000217/?ref_=nv_sr_srsg_0'
yield scrapy.Request( url = url,
callback = self.parse_front )
# First parsing method
def parse_front(self, response):
links_to_follow = response.xpath('//*[contains(@class,"filmo-row odd")]/b/a/@href').extract()
for url in links_to_follow:
yield response.follow(url = url,
callback = self.parse_pages)
# Second parsing method
def parse_pages(self, response):
title = response.css('h1::text').extract_first().strip()
year = response.css('#titleYear a::text').extract_first()
rating = response.css('.subtext::text').extract_first().strip() or None
users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
countries = [country.strip() for country in countries]
languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
languages = [language.strip() for language in languages]
actors = response.xpath('//td[not(@class)]/a/text()').extract()
actors = [actor.strip() for actor in actors]
tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
tagline = ''.join(tagline).strip() or None
directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
directors = [director.strip() for director in directors]
runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract()
gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
imdb_url = response.url.replace('?ref_=adv_li_tt', '')
mv4_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url
解决方案
尝试类似:
//*[@id='titleDetails']/div[contains(.,'Color')]
这将在找到所需属性时获取数据。
推荐阅读
- java - 我在 Java 中遇到错误:无法解析为变量
- r - 如何用 R 连接数据框中的行?
- reactjs - 在更新查询期间,一些突变保持“待定”
- r - 为什么我的闪亮应用程序在使用自定义函数构建情节图时会发出警告:错误!:无效的参数类型错误?
- r - 如何在 Shiny 的 data.frame 中将列名设置为数学公式?
- java - Intellij 插件开发 - 如何创建和复制文件和文件夹
- python - 无法在 Mac 上的 PycharmCE 中导入 cv2
- javascript - node js - 不推荐使用 body-parser 未定义扩展:提供扩展选项
- reactjs - 输入标签不起作用(redux-form)
- c# - 在 FluentEmail 中的后续电子邮件中不断添加收件人