首页 > 解决方案 > 使用 Scrapy 和 XPath 表达式抓取 IMDb 网站

问题描述

输出中的所有字段都是 None,我找不出代码中的问题所在。

我想从 IMDb 上抓取评分最高的前 1000 部电影的详细信息。

链接:- https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating

代码

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BestMoviesSpider(CrawlSpider):
    """Spider that follows movie links from an IMDb listing page.

    Every link matched by the single rule below is handed to ``parse_item``.
    """

    name = 'best_movies'
    allowed_domains = ['imdb.com']
    start_urls = [
        'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating',
    ]

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths="//h3[@class='lister-item-header']/a "),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield one dict of movie fields read from ``response`` via XPath."""

        def first_text(query):
            # First match of the XPath query, exactly as ``.get()`` returns it.
            return response.xpath(query).get()

        item = {
            'title': first_text("//h1[@class='TitleHeader__TitleText-sc-1wu6n3d-0 cLNRlG']/text()"),
            'year': first_text("(//li/span[@class='TitleBlockMetaData__ListItemText-sc-12ein40-2 jedhex'])[1]/text()"),
            'duration': first_text("(//li[@class='ipc-inline-list__item'])[3]/text()"),
            'rating': first_text("(//span[@class='AggregateRatingButton__RatingScore-sc-1il8omz-1 fhMjqK'])[2]/text()"),
            'director': first_text("(//a[@class='ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link'])[13]/text()"),
        }
        # The page address is added last, matching the original key order.
        item['movie_url'] = response.url
        yield item
        
        

标签: python scrapy imdb

解决方案


您代码中的 XPath 选择器几乎全部不正确,并且缺少分页规则。下面是完整的解决方案,从中您还可以了解如何让 CrawlSpider 处理分页。

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BestMoviesSpider(CrawlSpider):
    """Crawl the IMDb top-1000 listing and yield one item per movie page.

    Two rules drive the crawl:

    * links inside ``h3.lister-item-header`` are sent to ``parse_item``;
    * the "next page" link is only followed, so that the remaining listing
      pages are visited as well.
    """

    name = 'best_movies'
    allowed_domains = ['imdb.com']
    start_urls = ['https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating']

    rules = (
        # Movie links: scrape each one, but do not extract further links
        # from the movie page itself.
        Rule(
            LinkExtractor(restrict_xpaths="//h3[@class='lister-item-header']/a "),
            callback='parse_item',
            follow=False,
        ),
        # Pagination: deliberately no callback. A listing page is not a movie
        # page, so passing it to parse_item would emit a bogus item for every
        # page of results. The rule only needs to be followed so the rules
        # above are applied to the next listing page.
        Rule(
            LinkExtractor(restrict_xpaths='(//*[@class="lister-page-next next-page"])[1]'),
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a dict describing the movie shown on ``response``.

        Every field is the first XPath match, or ``None`` when the selector
        matches nothing; ``duration`` is the exception, since
        ``normalize-space()`` always evaluates to a string. ``movie_url``
        is the URL of the response.
        """
        title = response.xpath('(//h1/text())[1]').get()
        yield {
            # .get() returns None when there is no match; guard the strip()
            # so a page without an <h1> does not raise AttributeError.
            'title': title.strip() if title is not None else None,
            'year': response.xpath('//span[@id="titleYear"]/a/text()').get(),
            'duration': response.xpath('normalize-space((//time/text())[1])').get(),
            'rating': response.xpath('//*[@itemprop="ratingValue"]/text()').get(),
            'director': response.xpath('(//*[@class="credit_summary_item"]/h4/following-sibling::a)[1]/text()').get(),
            'movie_url': response.url,
        }
        
 

推荐阅读