Only crawling the first page and saving the detailed contents as a dataframe in Python

Problem description

I am trying to loop through the pages, scrape them, and save the detailed contents from this link:

Based on the code here, I modified it as follows:

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"

def get_main_urls() -> list:
    start_url = f"{BASE_URL}/index.html"
    return [start_url] + [f"{BASE_URL}/index_{i}.html" for i in range(1, 6)]


def get_follow_urls(urls: list, session: requests.Session()) -> iter:
    for url in urls[:1]:  # remove [:1] to scrape all the pages
        body = session.get(url).content
        s = BeautifulSoup(body, "lxml").find_all("td", {"width": "60%"})
        yield from [f"{BASE_URL}{a.find('a')['href'][1:]}" for a in s]

updated_df = pd.DataFrame()

with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        key = follow_url.rsplit("/")[-1].replace(".html", "")
        # print(f"Fetching data for {key}...")
        dfs = pd.read_html(
            connection_session.get(follow_url).content.decode("utf-8"),
            flavor="bs4")
        # https://stackoverflow.com/questions/39710903/pd-read-html-imports-a-list-rather-than-a-dataframe
        for df in dfs:
            # df = dfs[0].T
            df = dfs[0].T.iloc[1:, :].copy()
            updated_df = updated_df.append(df)
            print(updated_df)
    
    cols = ['项目编号', '转让/出租标的名称', '转让方/出租方名称', '转让标的评估价/年租金评估价(元)', 
            '转让底价/年租金底价(元)', '受让方/承租方名称', '成交价/成交年租金(元)', '成交日期']
    updated_df.columns = cols
    updated_df.to_excel('./data.xlsx', index = False)

But it only successfully scrapes the first page. How can I crawl all the pages and also add a URL column? Thanks.

Tags: pandas, dataframe, web-scraping, beautifulsoup, web-crawler

Solution


Is this what you are looking for? (Your version only reached the first page because of the urls[:1] slice in get_follow_urls.) This version processes all the URLs and dumps the list of dataframes into a single Excel file.

Here it is:

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"
COLUMNS = [
    '项目编号', '转让/出租标的名称', '转让方/出租方名称',
    '转让标的评估价/年租金评估价(元)', '转让底价/年租金底价(元)',
    '受让方/承租方名称', '成交价/成交年租金(元)', '成交日期', 'URL'
]


def get_main_urls() -> list:
    start_url = f"{BASE_URL}/index.html"
    return [start_url] + [f"{BASE_URL}/index_{i}.html" for i in range(1, 6)]


def get_follow_urls(urls: list, session: requests.Session) -> iter:
    for url in urls:
        body = session.get(url).content
        s = BeautifulSoup(body, "lxml").find_all("td", {"width": "60%"})
        # hrefs are relative ("./..."): drop the leading dot and prepend BASE_URL
        yield from [f"{BASE_URL}{a.find('a')['href'][1:]}" for a in s]


def post_process(list_of_dataframes: list, source_url: str) -> pd.DataFrame:
    # take the first (field/value) table, append a "URL" row, transpose so each
    # field becomes a column, then drop the field-name row
    temp_df = list_of_dataframes[0]
    temp_df = temp_df.append(
        pd.Series(["URL", source_url], index=temp_df.columns),
        ignore_index=True,
    )
    return temp_df.T.iloc[1:, :].copy()


def dump_to_excel(post_processed_dfs: list):
    df = pd.concat(post_processed_dfs)
    df.columns = COLUMNS
    df.to_excel("scraped_data.xlsx", index=False)


processed_dfs = []
with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        key = follow_url.rsplit("/")[-1].replace(".html", "")
        print(f"Fetching data for {key}...")
        df_list = pd.read_html(
            connection_session.get(follow_url).content.decode("utf-8"),
            flavor="bs4",
        )
        processed_dfs.append(post_process(df_list, follow_url))
    dump_to_excel(processed_dfs)
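
One portability note: DataFrame.append, which post_process relies on, was deprecated in pandas 1.4 and removed in pandas 2.0. A minimal sketch of an equivalent post_process for newer pandas versions, assuming the detail tables keep the same two-column field/value layout, could look like this:

import pandas as pd


def post_process(list_of_dataframes: list, source_url: str) -> pd.DataFrame:
    # Same idea as the function above, but built on pd.concat so it also runs
    # on pandas >= 2.0, where DataFrame.append no longer exists.
    temp_df = list_of_dataframes[0]
    url_row = pd.DataFrame([["URL", source_url]], columns=temp_df.columns)
    temp_df = pd.concat([temp_df, url_row], ignore_index=True)
    # transpose so each field becomes a column, then drop the field-name row
    return temp_df.T.iloc[1:, :].copy()

Everything else in the script can stay the same; only the row-appending step changes.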

Output:

[screenshot of the scraped output]
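
To sanity-check the export, the file can be read back with pandas (this assumes openpyxl is installed, which pandas uses to read and write .xlsx files):

import pandas as pd

check = pd.read_excel("scraped_data.xlsx")
print(check.shape)             # one row per scraped detail page, 9 columns
print(check.columns.tolist())  # should match COLUMNS defined above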

