Beautiful Soup script with two variables in the URL

Problem Description

I am trying to scrape this website. I need a script that puts a name into the "who=" part of the URL and the page number I want to scrape into the "page=" part.

Here is the current script:

import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
from colorama import Fore, Style

def main(url):
    names = ["Bryan", "David", "Robert"]
    with requests.Session() as req:
        data = []
        for name in names:
            for page in range(1, 9):
                print(url.format(Fore.RED + name + Style.RESET_ALL, Fore.YELLOW + str(page) + Style.RESET_ALL))
                print(f"Extracting Page# {page}")

                r = req.get(url.format(page))
                soup = bs(r.content, 'html.parser')

                names = [name.text for name in soup.select("h2.name.title.inbl")]
                address = [address.text for address in soup.select("div.h4.address.mtreset")]
                phone = [ph.group(1) for ph in re.finditer(r'mainLine":"(\d+)', r.text)]
                for x, y, z in zip(names, address, phone):
                    if z.startswith(("06", "07")):
                        data.append([x, y, z])
                        print(z)

            df = pd.DataFrame(data, columns=["Name", "Address", "Phone"])
            print(df)
            df.to_csv(r'C:////////.csv', index=False)
            print("Data Saved to your csv")

main("https://www.#########.com/search?part=1&who={}&page={}")

Could someone please explain what the problem is and correct this script?

Many thanks in advance.

Tags: python, variables, web-scraping, beautifulsoup

Solution

Your mistake is that you build the full URL only inside the print statement instead of assigning it to a variable you can reuse:

print(url.format(Fore.RED + name + Style.RESET_ALL, Fore.YELLOW + str(page) + Style.RESET_ALL))

while here you call format() on the template with only a single value:

r = req.get(url.format(page))
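To see why that fails, note that str.format() fills placeholders positionally: a template with two {} fields needs two arguments. A minimal sketch of the failure mode (example.com stands in for the real site):

template = "https://example.com/search?part=1&who={}&page={}"

# Both placeholders supplied, in order -> a usable URL.
print(template.format("Bryan", 2))
# https://example.com/search?part=1&who=Bryan&page=2

# Only one argument for two placeholders -> IndexError.
try:
    template.format(2)
except IndexError as err:
    print(err)  # Replacement index 1 out of range for positional args tuple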

Here are two different approaches:

Approach 1

import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
from colorama import Fore, Style

def main(url):
    names = ["Bryan", "David", "Robert"]
    with requests.Session() as req:
        data = []
        for name in names:
            for page in range(1, 9):
                # Format from the template into a separate variable; the
                # colorama codes are for the console only and must not be
                # sent to the server as part of the URL.
                page_url = url.format(name, page)
                print(url.format(Fore.RED + name + Style.RESET_ALL, Fore.YELLOW + str(page) + Style.RESET_ALL))
                print(f"Extracting Page# {page}")

                r = req.get(page_url)
                soup = bs(r.content, 'html.parser')

                # Fresh variable names here, so the outer `names` list we
                # are still looping over is not overwritten.
                found_names = [n.text for n in soup.select("h2.name.title.inbl")]
                addresses = [a.text for a in soup.select("div.h4.address.mtreset")]
                phones = [ph.group(1) for ph in re.finditer(r'mainLine":"(\d+)', r.text)]
                for x, y, z in zip(found_names, addresses, phones):
                    if z.startswith(("06", "07")):
                        data.append([x, y, z])
                        print(z)

        # Build and save the DataFrame once, after all names and pages.
        df = pd.DataFrame(data, columns=["Name", "Address", "Phone"])
        print(df)
        df.to_csv(r'C:////////.csv', index=False)
        print("Data Saved to your csv")

main("https://www.#########.com/search?part=1&who={}&page={}")

Approach 2

import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
from colorama import Fore, Style

def main(url):
    names = ["Bryan", "David", "Robert"]
    with requests.Session() as req:
        data = []
        for name in names:
            for page in range(1, 9):
                # Let requests build and URL-encode the query string from
                # plain values; the colorama codes are used only for the
                # console output below, never in the request itself.
                url_parm = {
                    'part': 1,
                    'who': name,
                    'page': page,
                }

                print(f"Extracting {Fore.RED + name + Style.RESET_ALL}, Page# {Fore.YELLOW + str(page) + Style.RESET_ALL}")

                r = req.get(url, params=url_parm)
                soup = bs(r.content, 'html.parser')

                found_names = [n.text for n in soup.select("h2.name.title.inbl")]
                addresses = [a.text for a in soup.select("div.h4.address.mtreset")]
                phones = [ph.group(1) for ph in re.finditer(r'mainLine":"(\d+)', r.text)]
                for x, y, z in zip(found_names, addresses, phones):
                    if z.startswith(("06", "07")):
                        data.append([x, y, z])
                        print(z)

        # Save once, after all pages for all names have been scraped.
        df = pd.DataFrame(data, columns=["Name", "Address", "Phone"])
        print(df)
        df.to_csv(r'C:////////.csv', index=False)
        print("Data Saved to your csv")

main("https://www.#########.com/search")
