Only the data from the first page is saved in the CSV file (Scrapy, Selenium)

Problem description

I am scraping a website that has several pages (reached via the "next" button with Selenium), each containing 20 job offers. Using a callback function, I can get the details of each offer.

Problem: in the CSV output, only the 20 jobs from the first page are ever saved. The code opens the Chrome browser and correctly moves from page to page, but no further data is extracted.

It seems nobody else has run into this problem, and I don't know what to do here. Any suggestions?

Terminal:

(screenshot of the terminal output, not reproduced here)

Code:

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from time import sleep


class GetdataSpider(scrapy.Spider):
    name = 'getdata'
    allowed_domains = ['workpool-jobs.ch']
    start_urls = ['https://www.workpool-jobs.ch/recht-jobs']

    def parse(self, response):
        url = 'https://www.workpool-jobs.ch/recht-jobs'
        self.driver = webdriver.Chrome('/Users/xxx/chromedriver')
        self.driver.maximize_window()  # maximize the browser window
        self.driver.implicitly_wait(10)  # implicit wait of up to 10 seconds
        self.driver.get(url)

        while True:
            sleep(3)
            sel = Selector(text=self.driver.page_source)
            single_joboffer = sel.xpath(".//p[@class='inserattitel h2 mt-0']/a/@href")
            for joboffer in single_joboffer:
                url1 = response.urljoin(joboffer.extract())
                yield scrapy.Request(url1, callback=self.parse_dir_contents)

            # scroll the "next page" link into view, then click it
            element = self.driver.find_element_by_css_selector("body > div.container-fluid.main-container.bg-white.py-5 > section.maincontent.row > div > nav:nth-child(11) > ul > li:nth-last-child(2) > a")
            self.driver.execute_script("window.scrollBy(0,4000)", "", element)
            sel = Selector(text=self.driver.page_source)
            sleep(3)
            self.driver.find_element_by_css_selector("body > div.container-fluid.main-container.bg-white.py-5 > section.maincontent.row > div > nav:nth-child(11) > ul > li:nth-last-child(2) > a").click()

        self.driver.close()

    def parse_dir_contents(self, response):
        single_info = response.xpath(".//*[@class='col-12 col-md mr-md-3 mr-xl-5']")

        for info in single_info:
            info_Titel = info.xpath(".//article/h1[@class='inserattitel']/text()").extract_first()
            info_Berufsfelder = info.xpath(".//article/div[@class='border-top-grau']/p/text()").extract()
            info_Arbeitspensum = info.xpath(".//article/div[@class='row bg-hellstblau']/div[@class='col-12 col-sm-6 col-lg-5']/dl/dd[1]/text()").extract_first()
            info_Anstellungsverhältnis = info.xpath(".//article/div[@class='row bg-hellstblau']/div[@class='col-12 col-sm-6 col-lg-5']/dl/dd[2]/text()").extract_first()
            info_Arbeitsort = info.xpath(".//article/div[@class='row bg-hellstblau']/div[@class='col-12 col-sm-6 col-lg-5']/dl/dd[4]/a/text()").extract()
            info_VerfügbarAb = info.xpath(".//article/div[@class='row bg-hellstblau']/div[@class='col-12 col-sm-6 col-lg-5']/dl/dd[5]/text()").extract()
            info_Kompetenzenqualifikation = info.xpath(".//article/div[@class='row bg-hellstblau']/div[@class='col-12 col-sm-6 col-lg-7']/dl[2]/dd/text()").extract_first()
            info_Aufgabengebiet = info.xpath(".//article/div[@class='border-bottom-grau'][1]//*[self::p or self::li]").extract()
            info_Erwartungen = info.xpath(".//article/div[@class='border-bottom-grau'][2]/ul/li[descendant-or-self::text()]").extract()
            info_WirBietenIhnen = info.xpath(".//article/div[@class='border-bottom-grau'][3]/ul/li[descendant-or-self::text()]").extract()
            info_Publikationsdatum = info.xpath(".//article/footer[@class='inseratfooter']/p[1]/strong/text()").extract_first()

            yield {'Titel': info_Titel,
                   'Berufsfelder': info_Berufsfelder,
                   'Arbeitspensum': info_Arbeitspensum,
                   'Anstellungsverhältnis': info_Anstellungsverhältnis,
                   'Arbeitsort': info_Arbeitsort,
                   'VerfügbarAb': info_VerfügbarAb,
                   'Kompetenzenqualifikation': info_Kompetenzenqualifikation,
                   'Aufgabengebiet': info_Aufgabengebiet,
                   'Erwartungen': info_Erwartungen,
                   'WirBietenIhnen': info_WirBietenIhnen,
                   'Publikationsdatum': info_Publikationsdatum}
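
(For reference, the CSV mentioned above is presumably produced with Scrapy's built-in feed export, e.g. scrapy crawl getdata -o jobs.csv; the output file name here is illustrative.)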

Tags: python, selenium, scrapy, save, extract

Solution


Try replacing

url1 = response.urljoin(joboffer.extract())

with

url1 = response.urljoin(joboffer.get())

extract() gets all of the matches as a list, whereas get() returns a single string.
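
For context, a minimal sketch of the difference between the two calls, assuming Scrapy's standard Selector API (the HTML snippet and variable names are illustrative):

from scrapy.selector import Selector

# two fake job links, mirroring the markup the spider targets
sel = Selector(text=(
    "<p class='inserattitel h2 mt-0'><a href='/job/1'>A</a></p>"
    "<p class='inserattitel h2 mt-0'><a href='/job/2'>B</a></p>"
))

links = sel.xpath(".//p[@class='inserattitel h2 mt-0']/a/@href")
print(links.extract())  # ['/job/1', '/job/2'] -- every match, as a list
print(links.get())      # '/job/1' -- only the first match, as a string

In recent Scrapy versions, get() and getall() are the preferred spellings of extract_first() and extract().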
