BeautifulSoup find_all() web scraping returns empty

Problem description

When trying to scrape multiple pages of this website, I get nothing in return. I usually check to make sure all of the lists I create are the same length, but here every list comes back with len = 0.

I have used similar code to scrape other websites, so why doesn't this code work?

Some solutions I have tried that did not serve my purpose: the .json approach suggested in this answer, and requests.Session() as suggested here.
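
One quick check (a minimal diagnostic sketch, using the first page URL from the loop below) is whether the raw HTML returned by requests contains the table at all:

import requests
from bs4 import BeautifulSoup

# Fetch the first constituency page and count the <table> elements in the
# raw HTML. If none are found, the table is rendered client-side by
# JavaScript, and requests alone will never see it.
resp = requests.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/1")
soup = BeautifulSoup(resp.text, 'html.parser')
print("tables found:", len(soup.find_all('table')))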

import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

from time import sleep
from random import randint
from googletrans import Translator

translator = Translator()

rg = []
ctr_n = []
ctr = []
yr = []
mn = []
sub = []
cst_n = []
cst = []
mag = []
pty_n = []
pty = []
can = []
pev1 = []
vot1 = []
vv1 = []
ivv1 = []
to1 = []
cv1 = []
cvs1 = []
pv1 = []
pvs1 = []
pev2 = []
vot2 = []
vv2 = []
ivv2 = []
to2 = []
cv2 = []
cvs2 = []
pv2 = []
pvs2 = []
seat = []
no_info = []
manual = []

START_PAGE = 1
END_PAGE = 42


for page in range(START_PAGE, END_PAGE + 1):

    page = requests.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))

    page.encoding = page.apparent_encoding

    if not page:
        pass

    else:

        soup = BeautifulSoup(page.text, 'html.parser')
    
        tbody = soup.find_all('table', class_='table table-borderd table-striped table-hover dataTable no-footer clickable right2 right4')

        sleep(randint(2,10))
        
        for container in tbody:

            col1 = container.find_all('tr', {'data-id':'26079'})
            for info in col1:
                col_1 = info.find_all('td')
                for data in col_1:
                    party = data[0]
                    party_trans = translator.translate(party)
                    pty_n.append(party_trans)

                    pvotes = data[1]
                    pv1.append(pvotes)

                    pshare = data[2]
                    pvs1.append(pshare)

                    mandates = data[3]
                    seat.append(mandates)

            col2 = container.find_all('tr', {'data-id':'26075'})
            for info in col2:
                col_2 = info.find_all('td')
                for data in col_2:
                    party2 = data[0]
                    party_trans2 = translator.translate(party2)
                    pty_n.append(party_trans2)

                    pvotes2 = data[1]
                    pv1.append(pvotes2)

                    pshare2 = data[2]
                    pvs1.append(pshare2)

                    mandates2 = data[3]
                    seat.append(mandates2)

            col3 = container.find_all('tr', {'data-id':'26063'})
            for info in col3:
                col_3 = info.find_all('td')
                for data in col_3:
                    party3 = data[0].text
                    party_trans3 = translator.translate(party3)
                    pty_n.extend(party_trans3)

                    pvotes3 = data[1].text
                    pv1.extend(pvotes3)

                    pshare3 = data[2].text
                    pvs1.extend(pshare3)

                    mandates3 = data[3].text
                    seat.extend(mandates3)

            col4 = container.find_all('tr', {'data-id':'26091'})
            for info in col4:
                col_4 = info.find_all('td',recursive=True)
                for data in col_4:
                    party4 = data[0]
                    party_trans4 = translator.translate(party4)
                    pty_n.extend(party_trans4)

                    pvotes4 = data[1]
                    pv1.extend(pvotes4)

                    pshare4 = data[2]
                    pvs1.extend(pshare4)

                    mandates4 = data[3]
                    seat.extend(mandates4)

            col5 = container.find_all('tr', {'data-id':'26073'})
            for info in col5:
                col_5 = info.find_all('td')
                for data in col_5:
                    party5 = data[0]
                    party_trans5 = translator.translate(party5)
                    pty_n.extend(party_trans5)

                    pvotes5 = data[1]
                    pv1.extend(pvotes5)

                    pshare5 = data[2]
                    pvs1.extend(pshare5)

                    mandates5 = data[3]
                    seat.extend(mandates5)

            col6 = container.find_all('tr', {'data-id':'26080'})
            for info in col6:
                col_6 = info.find_all('td')
                for data in col_6:
                    party6 = data[0]
                    party_trans6 = translator.translate(party6)
                    pty_n.extend(party_trans6)

                    pvotes6 = data[1]
                    pv1.extend(pvotes6)

                    pshare6 = data[2]
                    pvs1.extend(pshare6)

                    mandates6 = data[3]
                    seat.extend(mandates6)
            
                
        #### TOTAL  VOTES ####
        tfoot = soup.find_all('tfoot')
        for data in tfoot:
            fvote = data.find_all('td')
            for info in fvote:
                votefinal = info.find(text=True).get_text()
                fvoteindiv = [votefinal]
                fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
                vot1.extend(fvotelist)
            
        #### CONSTITUENCY NAMES ####
        constit = soup.find_all('a', class_='btn btn-link last')
        for data in constit:
            names = data.get_text()
            names_clean = names.replace("Sejum Constituency no.","")
            names_clean2 = names_clean.replace("[","")
            names_clean3 = names_clean2.replace("]","")
            namesfinal = names_clean3.split()[1]
            constitindiv = [namesfinal]
            constitlist = constitindiv * (len(pty_n) - len(cst_n))
            cst_n.extend(constitlist)

        #### UNSCRAPABLE INFO ####
        region = 'Europe'
        reg2 = [region]
        reglist = reg2 * (len(pty_n) - len(rg))
        rg.extend(reglist)

        country = 'Poland'
        ctr2 = [country]
        ctrlist = ctr2 * (len(pty_n) - len(ctr_n))
        ctr_n.extend(ctrlist)

        year = '2019'
        yr2 = [year]
        yrlist = yr2 * (len(pty_n) - len(yr))
        yr.extend(yrlist)

        month = '10'
        mo2 = [month]
        molist = mo2 * (len(pty_n) - len(mn))
        mn.extend(molist)

        codes = ''
        codes2 = [codes]
        codeslist = codes2 * (len(pty_n) - len(manual))
        manual.extend(codeslist)

        noinfo = '-990'
        noinfo2 = [noinfo]
        noinfolist = noinfo2 * (len(pty_n) - len(no_info))
        no_info.extend(noinfolist)

        print(len(rg), len(pty_n), len(pv1), len(pvs1), len(no_info), len(vot1), len(cst_n))

    

poland19 = pd.DataFrame({
'rg' : rg,
'ctr_n' : ctr_n,
'ctr': manual,
'yr' : yr,
'mn' : mn,
'sub' : manual,
'cst_n': cst_n,
'cst' : manual,
'mag': manual,
'pty_n': pty_n,
'pty': manual,
'can': can,
'pev1': no_info,
'vot1': vot1,
'vv1': vot1,
'ivv1': no_info,
'to1': no_info,
'cv1': no_info,
'cvs1': no_info,
'pv1': cv1,
'pvs1': cvs1,
'pev2': no_info,
'vot2': no_info,
'vv2': no_info,
'ivv2': no_info,
'to2': no_info,
'cv2': no_info,
'cvs2': no_info,
'pv2' : no_info,
'pvs2' : no_info,
'seat' : manual
})

print(poland19)

poland19.to_csv('poland_19.csv')

Tags: python, pandas, dataframe, web-scraping, beautifulsoup

Solution


As mentioned in the comments, you probably need to use Selenium. You can swap out the requests library and replace the request call with something like the following:

from selenium import webdriver


wd = webdriver.Chrome('pathToChromeDriver') # or any other Browser driver
wd.get(url) # instead of requests.get()
soup = BeautifulSoup(wd.page_source, 'html.parser')

You will need to install and set up the selenium library as described here: https://selenium-python.readthedocs.io/
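
Note that newer Selenium releases (4.x) deprecate the positional driver-path argument; the equivalent setup, assuming a locally installed chromedriver, looks like this:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: the driver path is passed via a Service object
wd = webdriver.Chrome(service=Service('pathToChromeDriver'))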

Note: I tested your code with selenium and was able to get the table you are looking for, but class_=... did not work for some reason. Browsing the scraped data instead, I found that the table has an id attribute. So maybe also try this:

tbody = soup.find_all('table', id="DataTables_Table_0")

Again, this works by performing the get request through the selenium library. Hope this helps :) Cheers
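
Putting it together, here is a minimal sketch of a fetch step that could replace requests.get() inside the page loop. It assumes chromedriver is available on the PATH and uses an explicit wait so the DataTables widget has finished rendering before page_source is read (the id DataTables_Table_0 is the one found above):

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wd = webdriver.Chrome()  # assumes chromedriver is on the PATH

def fetch_soup(page_number):
    # Load the constituency page, then wait until the JS-rendered
    # results table actually exists before reading page_source.
    wd.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page_number))
    WebDriverWait(wd, 10).until(
        EC.presence_of_element_located((By.ID, "DataTables_Table_0"))
    )
    return BeautifulSoup(wd.page_source, 'html.parser')

soup = fetch_soup(1)
tbody = soup.find_all('table', id="DataTables_Table_0")
print(len(tbody))  # should be 1 once the table has rendered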

