beautifulsoup for loop only extracts first-page data

Problem Description

I have a txt file with 2 URLs in it:

https://www.kununu.com/de/volkswagen/kommentare
https://www.kununu.com/de/audi/kommentare

I want to use BeautifulSoup to extract some data from every page at each of those URLs. The code below extracts the data, but only from the first page. I must be missing something. Can you update the code so that it extracts from all pages?

firma = []

lineList2 = [line.rstrip('\n') for line in open(r"C:/myfolder/555.txt")]

print(lineList2)

for url in lineList2:
    with requests.Session() as session:
        session.headers = {
            'x-requested-with': 'XMLHttpRequest'
        }
        page = 1
        while True:
            print(f"Processing page {page}..")
            url = f'{url}/{page}'
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')
            print("Number of articles: " + str(len(articles)))
            for article in articles:

                try:
                    firmaText = article.find('div', text=re.compile(r'Firma')).find_next('div').text.strip()
                    firma.append(firmaText)
                except:
                    firma.append('N/A')

            page += 1

            pagination = soup.find_all('div', {'class': 'paginationControl'})
            if not pagination:
                break

    df = pd.DataFrame({
        'Company': firma
    })
print(df)

Tags: python, for-loop, beautifulsoup

Solution

The code requests only the first page because the line url = f'{url}/{page}' reassigns the loop variable url. The first request goes to .../kommentare/1, but on the next pass the already-modified value is extended again, producing .../kommentare/1/2, which returns a page without pagination controls, so the while loop breaks. The fix is to keep the original URL in a separate variable (lurl below) and rebuild the per-page URL from it on every iteration.
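The minimal change, isolated (everything else in the script stays the same):

# before: the loop variable is clobbered on every pass
for url in lineList2:
    ...
        url = f'{url}/{page}'   # the second request becomes .../kommentare/1/2

# after: the base URL stays untouched
for lurl in lineList2:
    ...
        url = f'{lurl}/{page}'

The full corrected script: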


from bs4 import BeautifulSoup
import re
import requests
import pandas as pd

firma = []

# Read the URLs, one per line, dropping the trailing newlines
lineList2 = []
with open('555.txt', 'r') as file:
    for line in file:
        lineList2.append(line.strip('\n'))

print(lineList2)

# Keep the base URL in its own variable (lurl) so it is not
# overwritten when the per-page URL is built below.
for lurl in lineList2:
    with requests.Session() as session:
        session.headers = {
            'x-requested-with': 'XMLHttpRequest'
        }
        page = 1
        while True:
            print(f"Processing page {page}..")
            url = f'{lurl}/{page}'
            print(url)
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')
            print("Number of articles: " + str(len(articles)))
            for article in articles:
                try:
                    firmaText = article.find('div', text=re.compile(r'Firma')).find_next('div').text.strip()
                    firma.append(firmaText)
                except AttributeError:
                    # This article has no 'Firma' block
                    firma.append('N/A')

            page += 1

            # The last page carries no pagination controls; stop there
            pagination = soup.find_all('div', {'class': 'paginationControl'})
            if not pagination:
                break

# Collect the results once all URLs have been processed
df = pd.DataFrame({
    'Company': firma
})
print(df)
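One caveat with this fix (a side note, not part of the original answer): the while loop stops only when the paginationControl div disappears, so if the site ever rendered that div on an empty trailing page, the scrape would run one page too far. A slightly more defensive sketch, assuming the same markup (article tags, the 'Firma' label) as above — the helper name scrape_all_pages is made up for illustration:

from bs4 import BeautifulSoup
import re
import requests

def scrape_all_pages(base_url, session):
    # Yield the 'Firma' value of every article; an empty page ends the pagination.
    page = 1
    while True:
        response = session.get(f'{base_url}/{page}')
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article')
        if not articles:
            break
        for article in articles:
            label = article.find('div', text=re.compile(r'Firma'))
            yield label.find_next('div').text.strip() if label else 'N/A'
        page += 1

with requests.Session() as session:
    session.headers = {'x-requested-with': 'XMLHttpRequest'}
    firma = list(scrape_all_pages('https://www.kununu.com/de/volkswagen/kommentare', session))

Checking the label before dereferencing it also replaces the broad try/except of the original loop.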
