Python value reshaping problem

Problem description

I am using this scraper:

https://github.com/PHMark/Web-Scraping

I modified it according to my needs; the code is as follows:

from bs4 import BeautifulSoup as bs
from selenium import webdriver
import urllib.request, urllib.error, urllib.parse
import re
import ssl
import pandas as pd
import numpy as np
import os

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
#chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--no-sandbox')
prefs = {'download.default_directory' : os.getcwd()}
chrome_options.add_experimental_option('prefs', prefs)

class SoupMaker():
    """
    A class that scrapes indeed's Job ads
    """
    def __init__(self, _url, _driver):
        self.base_url = "https://www.indeed.com"
        self.home_url = self.base_url + _url
        self.job_links = []
        self.driver = _driver
        self.job_datas = []
        self.job_table = []
        
    def read_page(self):        
        self.ctx = ssl.create_default_context()
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE
        print("Parsing: ", self.home_url)
        self.url = urllib.request.urlopen(self.home_url,
                              context = self.ctx).read()
        _soup1 = bs(self.url, "html.parser")
        self.a_tags = _soup1('a')
        
    def get_job_url(self):
        for link in self.a_tags:
            link = link.get("href", None)
            if link != None:
                cmp_url = re.search("^/.+/.+/jobs/.+", link)
                rc_url = re.search("^/rc.+", link)
                if cmp_url or rc_url:
                    self.job_links.append(self.base_url + link.strip())
                    
    def get_job_info(self):
        for link in self.job_links:
            print("    Scraping: ", link)
            self.driver.get(link)
            self.driver.implicitly_wait(2750)
            _soup2 = bs(self.driver.page_source, "lxml")
            self.title = _soup2.find("title").get_text()
            self.job_descs = _soup2.find_all('div', 'jobsearch-JobComponent-description icl-u-xs-mt--md')
            self.job_origins = _soup2.find_all('div', 'jobsearch-JobMetadataFooter')
            
            self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]
            self.job_location = re.findall(".+ - (.+) - .+", self.title)[0]
            self.description = ''
            for d in self.job_descs:
                self.description += d.get_text("|", strip = True) 
            self.origin = re.findall("^.+ ago", self.job_origins[0].get_text())[0]    
            self.job_datas.append(self.job_title)
            self.job_datas.append(self.job_location)
            self.job_datas.append(self.description)
            self.job_datas.append(self.origin)
            
        self.x = np.array(self.job_datas).reshape((10,4))
        df = pd.DataFrame(data=self.x, columns=['Job Title', 'Job Location',
                                    'Job Description', 'Job Origin'])
        return df
        
if __name__ == '__main__':
    n = int(input("Enter no. of pages to scrape: "))
    n = n*10
    file_name = input("Enter CSV filename: ")
    #    driver = webdriver.Chrome(r"C:\chromedriver\chromedriver.exe")
    #driver = webdriver.Chrome('/usr/local/bin/chromedrive') 
    driver = webdriver.Chrome('/usr/local/bin/chromedriver', chrome_options=chrome_options)
    writer = pd.ExcelWriter('{0}.xlsx'.format(file_name), engine='xlsxwriter')
    df = []
    
    for i in range(10, n+10, 10):
        #ext = "/jobs?q=&l=United+States&start={0}".format(i-10)
        ext = "/jobs?l=United+States&start={0}".format(i-10)
        if n == 10:
            #ext = "/jobs-in-United+States"
            ext ="/l-United+States-jobs.html"
        s = SoupMaker(ext, driver)
        s.read_page()
        s.get_job_url()
        df.append(s.get_job_info())
        
    result = pd.concat(df)
    result.to_excel(writer, index=False)
    writer.save()
    driver.close()

The script works fine if I only scrape 1 page, but if I try to scrape more than 10 pages I get the following error:

Traceback (most recent call last):
  File "file.py", line 96, in <module>
    df.append(s.get_job_info())
  File "file.py", line 71, in get_job_info
    self.x = np.array(self.job_datas).reshape((10,4))
ValueError: cannot reshape array of size 0 into shape (10,4)

If the number of pages entered is greater than 100 or 50, it gives the following error:

Traceback (most recent call last):
  File "file.py", line 100, in <module>
    df.append(s.get_job_info())
  File "file.py", line 64, in get_job_info
    self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]
IndexError: list index out of range

I would greatly appreciate it if someone could help me with this. Thanks in advance!

Tags: python, python-3.x, selenium, beautifulsoup

Solution


Just from looking at this, I think the problem is that it isn't actually retrieving any data. If no links are parsed in the get_job_url method, the loop in get_job_info never runs, the job_datas list stays empty, and the resulting array has size 0, which is why the reshape to (10, 4) fails.
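One way to make that step robust is to derive the row count from what was actually collected and return an empty DataFrame when nothing was scraped. A minimal sketch (not the original author's code; the helper name build_frame is made up here), assuming each job still contributes the same four fields as in get_job_info:

import numpy as np
import pandas as pd

def build_frame(job_datas):
    # Each scraped job contributes 4 entries: title, location, description, origin.
    columns = ['Job Title', 'Job Location', 'Job Description', 'Job Origin']
    if not job_datas:
        # Nothing was scraped for this page, so return an empty frame
        # instead of letting reshape((10, 4)) fail on a size-0 array.
        return pd.DataFrame(columns=columns)
    rows = len(job_datas) // 4
    x = np.array(job_datas[:rows * 4]).reshape((rows, 4))
    return pd.DataFrame(data=x, columns=columns)

get_job_info could then end with return build_frame(self.job_datas), which also drops the hard-coded assumption that every page yields exactly 10 ads.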

To get a better picture of what is happening, step through the state with a debugger, or simply add print statements. It may be that the URL built for 10 pages is bad and returns a 404 page that contains no matching links.
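For instance, the main loop could print how many links each page produced and skip pages that returned nothing. This is a rough sketch reusing the names from the script above, not a drop-in fix:

for i in range(10, n + 10, 10):
    ext = "/jobs?l=United+States&start={0}".format(i - 10)
    s = SoupMaker(ext, driver)
    s.read_page()
    s.get_job_url()
    # Show how many job links this page actually yielded.
    print("Offset {0}: {1} job links found".format(i - 10, len(s.job_links)))
    if not s.job_links:
        # Likely a 404 or an unexpected page layout; skip instead of crashing later.
        print("No links parsed for", s.home_url, "- skipping this page")
        continue
    df.append(s.get_job_info())

A similar check around the re.findall(...)[0] calls in get_job_info (testing whether the list is empty before indexing) would turn the IndexError on unexpected page titles into a skipped record instead of a crash.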

