首页 > 解决方案 > Scrapy Spider 中的 if 条件判断

问题描述

我正在尝试抓取 IMDb,获取 Steven Spielberg 或 Martin Scorsese 所执导电影的信息。问题在于,HTML 中的“预算”(Budget)一行在某些电影页面上是 div[7],在另一些页面上却是 div[8]。我想写一个带条件的提取逻辑:只有当某个 div 包含 “Budget” 这个词时,才提取其中的数据。下面是我的代码:

# Import the needed packages
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import numpy as np

### Scrape movies of Steven Spielberg ###
## IMDB_Spider1
# Create the Spider class1
class IMDB_Spider1(scrapy.Spider):
  """Scrape the titles listed in the "filmo-row even" rows of IMDb name page nm0000229.

  Every followed title page is parsed by ``parse_pages``, which stores one
  tuple per title in ``mv1_dict``. ``mv1_dict`` is not defined in this class;
  it is expected to exist as a module-level dict.
  """
  name = "IMDB_spider1"

  # start_requests method
  def start_requests(self):
    """Request the filmography page and route the response to ``parse_front``."""
    url = 'https://www.imdb.com/name/nm0000229/?ref_=nv_sr_srsg_0'
    yield scrapy.Request(url=url, callback=self.parse_front)

  # First parsing method
  def parse_front(self, response):
    """Follow each title link found in the "filmo-row even" rows."""
    links_to_follow = response.xpath('//*[contains(@class,"filmo-row even")]/b/a/@href').extract()
    for url in links_to_follow:
      yield response.follow(url=url, callback=self.parse_pages)

  def _title_details(self, response, label):
    """Return the raw text nodes of the #titleDetails row whose <h4> contains ``label``.

    Selecting by label instead of by position (div[7], div[8], ...) keeps the
    values aligned when a page has more or fewer detail rows. Returns an
    empty list when the page has no such row.
    """
    query = "//*[@id='titleDetails']/div[contains(.//h4, '%s')]/text()" % label
    return response.xpath(query).extract()

  # Second parsing method
  def parse_pages(self, response):
    """Extract the fields of one title page and store them in ``mv1_dict[title]``.

    The stored value is a 17-tuple: year, rating, users_rating, votes,
    metascore, countries, languages, actors, tagline, directors, runtime,
    genres, budget, opening_week_us, gross_usa, world_wide_box_office,
    imdb_url. Pages without a title are logged and skipped.
    """
    # extract_first() returns None when nothing matches, so guard before strip().
    title = (response.css('h1::text').extract_first() or '').strip()
    if not title:
      self.logger.warning('No title found on %s; page skipped', response.url)
      return

    year = response.css('#titleYear a::text').extract_first()
    rating = (response.css('.subtext::text').extract_first() or '').strip() or None
    users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
    votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
    metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()

    countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
    countries = [country.strip() for country in countries]
    languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
    languages = [language.strip() for language in languages]
    actors = response.xpath('//td[not(@class)]/a/text()').extract()
    actors = [actor.strip() for actor in actors]

    tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
    tagline = ''.join(tagline).strip() or None

    # Strip first, then fall back to None: iterating over None would raise TypeError.
    directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract()
    directors = [director.strip() for director in directors] or None

    runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
    # NOTE(review): genres is still positional (div[4]) and may shift between pages.
    genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()

    # Box-office rows are located by their <h4> label rather than by index.
    # NOTE(review): label strings other than 'Budget' are assumed from IMDb's
    # page wording -- confirm against a live page.
    budget = self._title_details(response, 'Budget')
    opening_week_us = self._title_details(response, 'Opening Weekend')
    gross_usa = self._title_details(response, 'Gross USA')
    world_wide_box_office = self._title_details(response, 'Worldwide Gross')

    imdb_url = response.url.replace('?ref_=adv_li_tt', '')
    mv1_dict[title] = (
      year, rating, users_rating, votes, metascore, countries, languages,
      actors, tagline, directors, runtime, genres, budget, opening_week_us,
      gross_usa, world_wide_box_office, imdb_url,
    )
    
## IMDB_Spider2
# Create the Spider class2
class IMDB_Spider2(scrapy.Spider):
  """Scrape the titles listed in the "filmo-row odd" rows of IMDb name page nm0000229.

  Every followed title page is parsed by ``parse_pages``, which stores one
  tuple per title in ``mv2_dict``. ``mv2_dict`` is not defined in this class;
  it is expected to exist as a module-level dict.
  """
  name = "IMDB_spider2"

  # start_requests method
  def start_requests(self):
    """Request the filmography page and route the response to ``parse_front``."""
    url = 'https://www.imdb.com/name/nm0000229/?ref_=nv_sr_srsg_0'
    yield scrapy.Request(url=url, callback=self.parse_front)

  # First parsing method
  def parse_front(self, response):
    """Follow each title link found in the "filmo-row odd" rows."""
    links_to_follow = response.xpath('//*[contains(@class,"filmo-row odd")]/b/a/@href').extract()
    for url in links_to_follow:
      yield response.follow(url=url, callback=self.parse_pages)

  def _title_details(self, response, label):
    """Return the raw text nodes of the #titleDetails row whose <h4> contains ``label``.

    Selecting by label instead of by position (div[7], div[8], ...) keeps the
    values aligned when a page has more or fewer detail rows. Returns an
    empty list when the page has no such row.
    """
    query = "//*[@id='titleDetails']/div[contains(.//h4, '%s')]/text()" % label
    return response.xpath(query).extract()

  # Second parsing method
  def parse_pages(self, response):
    """Extract the fields of one title page and store them in ``mv2_dict[title]``.

    The stored value is a 17-tuple: year, rating, users_rating, votes,
    metascore, countries, languages, actors, tagline, directors, runtime,
    genres, budget, opening_week_us, gross_usa, world_wide_box_office,
    imdb_url. Pages without a title are logged and skipped.
    """
    # extract_first() returns None when nothing matches, so guard before strip().
    title = (response.css('h1::text').extract_first() or '').strip()
    if not title:
      self.logger.warning('No title found on %s; page skipped', response.url)
      return

    year = response.css('#titleYear a::text').extract_first()
    rating = (response.css('.subtext::text').extract_first() or '').strip() or None
    users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
    votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
    metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()

    countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
    countries = [country.strip() for country in countries]
    languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
    languages = [language.strip() for language in languages]
    actors = response.xpath('//td[not(@class)]/a/text()').extract()
    actors = [actor.strip() for actor in actors]

    tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
    tagline = ''.join(tagline).strip() or None

    # Strip first, then fall back to None: iterating over None would raise TypeError.
    directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract()
    directors = [director.strip() for director in directors] or None

    runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
    # NOTE(review): genres is still positional (div[4]) and may shift between pages.
    genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()

    # Box-office rows are located by their <h4> label rather than by index.
    # NOTE(review): label strings other than 'Budget' are assumed from IMDb's
    # page wording -- confirm against a live page.
    budget = self._title_details(response, 'Budget')
    opening_week_us = self._title_details(response, 'Opening Weekend')
    gross_usa = self._title_details(response, 'Gross USA')
    world_wide_box_office = self._title_details(response, 'Worldwide Gross')

    imdb_url = response.url.replace('?ref_=adv_li_tt', '')
    mv2_dict[title] = (
      year, rating, users_rating, votes, metascore, countries, languages,
      actors, tagline, directors, runtime, genres, budget, opening_week_us,
      gross_usa, world_wide_box_office, imdb_url,
    )

### Scrape movies of Martin Scorsese ###
## IMDB_Spider3
# Create the Spider class3
class IMDB_Spider3(scrapy.Spider):
  """Scrape the titles listed in the "filmo-row even" rows of IMDb name page nm0000217.

  Every followed title page is parsed by ``parse_pages``, which stores one
  tuple per title in ``mv3_dict``. ``mv3_dict`` is not defined in this class;
  it is expected to exist as a module-level dict.
  """
  name = "IMDB_spider3"

  # start_requests method
  def start_requests(self):
    """Request the filmography page and route the response to ``parse_front``."""
    url = 'https://www.imdb.com/name/nm0000217/?ref_=nv_sr_srsg_0'
    yield scrapy.Request(url=url, callback=self.parse_front)

  # First parsing method
  def parse_front(self, response):
    """Follow each title link found in the "filmo-row even" rows."""
    links_to_follow = response.xpath('//*[contains(@class,"filmo-row even")]/b/a/@href').extract()
    for url in links_to_follow:
      yield response.follow(url=url, callback=self.parse_pages)

  def _title_details(self, response, label):
    """Return the raw text nodes of the #titleDetails row whose <h4> contains ``label``.

    Selecting by label instead of by position (div[7], div[8], ...) keeps the
    values aligned when a page has more or fewer detail rows. Returns an
    empty list when the page has no such row.
    """
    query = "//*[@id='titleDetails']/div[contains(.//h4, '%s')]/text()" % label
    return response.xpath(query).extract()

  # Second parsing method
  def parse_pages(self, response):
    """Extract the fields of one title page and store them in ``mv3_dict[title]``.

    The stored value is a 17-tuple: year, rating, users_rating, votes,
    metascore, countries, languages, actors, tagline, directors, runtime,
    genres, budget, opening_week_us, gross_usa, world_wide_box_office,
    imdb_url. Pages without a title are logged and skipped.
    """
    # extract_first() returns None when nothing matches, so guard before strip().
    title = (response.css('h1::text').extract_first() or '').strip()
    if not title:
      self.logger.warning('No title found on %s; page skipped', response.url)
      return

    year = response.css('#titleYear a::text').extract_first()
    rating = (response.css('.subtext::text').extract_first() or '').strip() or None
    users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
    votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
    metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()

    countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
    countries = [country.strip() for country in countries]
    languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
    languages = [language.strip() for language in languages]
    actors = response.xpath('//td[not(@class)]/a/text()').extract()
    actors = [actor.strip() for actor in actors]

    tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
    tagline = ''.join(tagline).strip() or None

    # Strip first, then fall back to None: iterating over None would raise TypeError.
    directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract()
    directors = [director.strip() for director in directors] or None

    runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
    # NOTE(review): genres is still positional (div[4]) and may shift between pages.
    genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()

    # Box-office rows are located by their <h4> label rather than by index.
    # NOTE(review): label strings other than 'Budget' are assumed from IMDb's
    # page wording -- confirm against a live page.
    budget = self._title_details(response, 'Budget')
    opening_week_us = self._title_details(response, 'Opening Weekend')
    gross_usa = self._title_details(response, 'Gross USA')
    world_wide_box_office = self._title_details(response, 'Worldwide Gross')

    imdb_url = response.url.replace('?ref_=adv_li_tt', '')
    mv3_dict[title] = (
      year, rating, users_rating, votes, metascore, countries, languages,
      actors, tagline, directors, runtime, genres, budget, opening_week_us,
      gross_usa, world_wide_box_office, imdb_url,
    )

## IMDB_Spider4
# Create the Spider class4
class IMDB_Spider4(scrapy.Spider):
  """Scrape the titles listed in the "filmo-row odd" rows of IMDb name page nm0000217.

  Every followed title page is parsed by ``parse_pages``, which stores one
  tuple per title in ``mv4_dict``. ``mv4_dict`` is not defined in this class;
  it is expected to exist as a module-level dict.
  """
  name = "IMDB_spider4"

  # start_requests method
  def start_requests(self):
    """Request the filmography page and route the response to ``parse_front``."""
    url = 'https://www.imdb.com/name/nm0000217/?ref_=nv_sr_srsg_0'
    yield scrapy.Request(url=url, callback=self.parse_front)

  # First parsing method
  def parse_front(self, response):
    """Follow each title link found in the "filmo-row odd" rows."""
    links_to_follow = response.xpath('//*[contains(@class,"filmo-row odd")]/b/a/@href').extract()
    for url in links_to_follow:
      yield response.follow(url=url, callback=self.parse_pages)

  def _title_details(self, response, label):
    """Return the raw text nodes of the #titleDetails row whose <h4> contains ``label``.

    Selecting by label instead of by position (div[7], div[8], ...) keeps the
    values aligned when a page has more or fewer detail rows. Returns an
    empty list when the page has no such row.
    """
    query = "//*[@id='titleDetails']/div[contains(.//h4, '%s')]/text()" % label
    return response.xpath(query).extract()

  # Second parsing method
  def parse_pages(self, response):
    """Extract the fields of one title page and store them in ``mv4_dict[title]``.

    The stored value is a 17-tuple: year, rating, users_rating, votes,
    metascore, countries, languages, actors, tagline, directors, runtime,
    genres, budget, opening_week_us, gross_usa, world_wide_box_office,
    imdb_url. Pages without a title are logged and skipped.
    """
    # extract_first() returns None when nothing matches, so guard before strip().
    title = (response.css('h1::text').extract_first() or '').strip()
    if not title:
      self.logger.warning('No title found on %s; page skipped', response.url)
      return

    year = response.css('#titleYear a::text').extract_first()
    rating = (response.css('.subtext::text').extract_first() or '').strip() or None
    users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
    votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
    metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()

    countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
    countries = [country.strip() for country in countries]
    languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
    languages = [language.strip() for language in languages]
    actors = response.xpath('//td[not(@class)]/a/text()').extract()
    actors = [actor.strip() for actor in actors]

    tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
    tagline = ''.join(tagline).strip() or None

    # Strip first, then fall back to None: iterating over None would raise TypeError.
    directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract()
    directors = [director.strip() for director in directors] or None

    runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
    # NOTE(review): genres is still positional (div[4]) and may shift between pages.
    genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()

    # Box-office rows are located by their <h4> label rather than by index.
    # NOTE(review): label strings other than 'Budget' are assumed from IMDb's
    # page wording -- confirm against a live page.
    budget = self._title_details(response, 'Budget')
    opening_week_us = self._title_details(response, 'Opening Weekend')
    gross_usa = self._title_details(response, 'Gross USA')
    world_wide_box_office = self._title_details(response, 'Worldwide Gross')

    imdb_url = response.url.replace('?ref_=adv_li_tt', '')
    mv4_dict[title] = (
      year, rating, users_rating, votes, metascore, countries, languages,
      actors, tagline, directors, runtime, genres, budget, opening_week_us,
      gross_usa, world_wide_box_office, imdb_url,
    )

标签: python scrapy

解决方案


尝试类似:

//*[@id='titleDetails']/div[contains(.,'Color')]

这样只要某个 div 中包含指定的文字,就会取到该行的数据,而不必依赖它是 div[7] 还是 div[8]。对于预算一行,把示例中的 'Color' 换成 'Budget' 即可。


推荐阅读