首页 > 解决方案 > Scrapy spider 无法正确迭代并存在 If 语句问题

问题描述

我正在尝试使用 Scrapy 从表中抓取申请人数据。我有两个问题:

1)我想要每行每个申请人的CSV:

'username': ['clickclack123'],'lsat':['170'],'gpa':['3.57']... 

我的代码当前在一行中提取所有申请人数据,忽略空值,并针对页面上的申请人数重复提取(100 行相同,每行包含页面上的所有数据):

'username': ['clickclack123','UM2014','3litersaday'...

2) 该表包含一类元素(“能指”),表明申请人的特征。我想包含一个 If 语句来检查能指并将每个特征保存为 True 如果适用。我在 lawschool.py(下)中包含了一个带有这种逻辑的 If 语句,但它不允许我的蜘蛛运行。

我的想法和尝试:

法学院.py

import scrapy
from ..items import ApplicantItem

class LawschoolSpider(scrapy.Spider):
    name = "lawschool"
    start_urls = [
        'http://nyu.lawschoolnumbers.com/applicants',
        'http://columbia.lawschoolnumbers.com/applicants'
    ]

    def parse(self, response):
        items = []
        for applicant in response.xpath("//tr[@class='row']"):
            signifier = response.xpath("//span[@class='signifier']/text()").extract()
            if signifier == 'W':
                withdrawn = True
            elif signifier == 'A':
                accepted == True
            elif signifier == 'U':
                minority == True
            elif signifier == 'N':
                non_traditional == True
            elif signifier == 'I':
                international = True
            else:
                return False
            school = response.xpath("//h1/text()").extract()
            school = [i.replace(' Applicants','') for i in school]
            item = ApplicantItem(
                school = school,
                username = response.xpath("//td/a/text()").extract(),
                lsat = response.xpath("//td[contains(@style, 'font-weight:bold')]/following-sibling::td[1]/text()").extract(),
                gpa = response.xpath("//td[contains(@style, 'font-weight:bold')]/following-sibling::td[2]/text()").extract(),
                scholarship = response.xpath("//td[contains(@style, 'font-weight:bold')]/following-sibling::td[4]/text()").extract(),
                status = response.xpath("//td[contains(@style, 'font-weight:bold')]/following-sibling::td[5]/text()").extract(),
                sent = response.xpath("//td[contains(@style, 'font-weight:bold')]/following-sibling::td[6]/text()").extract(),
                complete = response.xpath("//td[contains(@style, 'font-weight:bold')]/following-sibling::td[7]/text()").extract(),
                decision = response.xpath("//td[contains(@style, 'font-weight:bold')]/following-sibling::td[8]/text()").extract(),
                last_updated = response.xpath("//td[contains(@style, 'font-weight:bold')]/following-sibling::td[9]/text()").extract()
                withdrawn_application = withdrawn,
                accepted_offer = accepted,
                minority = minority,
                non_traditional = non_traditional,
                international = international
            )
            yield item

        for a in response.xpath("//*[@id='applicants_list']/div/a[9]"):
            yield response.follow(a, callback=self.parse)

项目.py

from scrapy import Item, Field


class ApplicantItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    school = Field()
    username = Field()
    lsat = Field()
    gpa = Field()
    scholarship = Field()
    status = Field()
    sent = Field()
    complete = Field()
    decision = Field()
    last_updated = Field()
    withdrawn_application = Field()
    accepted_offer = Field()
    minority = Field()
    non_traditional = Field()
    international = Field()

管道.py

from scrapy import signals
from scrapy.exporters import CsvItemExporter

from .items import ApplicantItem

class LSNPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        item_names = ['applicant']
        self.files = self.files = {n: open('%s.csv' % n, 'w+b') for n in item_names}
        self.exporters = {n: CsvItemExporter(f) for n, f in self.files.items()}
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()

        for file in self.files.values():
            file.close()

    def process_item(self, item, spider):
        if isinstance(item, ApplicantItem):
            self.exporters['applicant'].export_item(item)

        return item

标签: pythonfor-loopscrapy

解决方案


您需要相对XPath 表达式:

username = applicant.xpath(".//td/a/text()").extract(),
lsat = applicant.xpath(".//td[2]/text()").extract(),
gpa = applicant.xpath(".//td[3]/text()").extract(),
...

推荐阅读