python - URL 中有两个变量的 Beautiful Soup 脚本
问题描述
我正在尝试抓取这个网站。我需要一个脚本,把名称填入 URL 的“who=”部分,并把要抓取的页码填入“page=”部分。
这是当前脚本:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
from colorama import Fore, Style
# Scrape names, addresses and phone numbers for several search names and
# pages 1-8, keep the rows whose phone starts with "06" or "07", and write
# them to a CSV file. `url` is a template with two `{}` placeholders.
def main(url):
names = ["Bryan", "David", "Robert"]
with requests.Session() as req:
data = []
for name in names:
for page in range(1, 9):
# Display only: both placeholders are filled here, wrapped in colorama colour codes.
print(url.format(Fore.RED + name + Style.RESET_ALL, Fore.YELLOW + str(page) + Style.RESET_ALL))
print(f"Extracting Page# {page}")
# NOTE(review): only `page` is passed to a template that has two `{}`
# placeholders, so this format call raises IndexError and `name` never
# reaches the request.
r = req.get(url.format(page))
soup = bs(r.content, 'html.parser')
# NOTE(review): this rebinds `names`, the same identifier as the list of search names above.
names = [name.text for name in soup.select("h2.name.title.inbl")]
address = [address.text for address in soup.select("div.h4.address.mtreset")]
# Phone numbers come from the raw response text: the digits following `mainLine":"`.
phone = [ph.group(1) for ph in re.finditer(r'mainLine":"(\d+)', r.text)]
# The three lists are paired by position; zip stops at the shortest one.
for x, y, z in zip(names, address, phone):
if z.startswith(("06", "07")):
data.append([x, y, z])
print(z)
df = pd.DataFrame(data, columns=["Name", "Address", "Phone"])
print(df)
df.to_csv(r'C:////////.csv', index=False)
print("Data Saved to your csv")
# The template carries two placeholders: `who={}` and `page={}`.
main("https://www.#########.com/search?part=1&who={}&page={}")
请问有人能解释一下问题出在哪里,并帮忙纠正这个脚本吗?
提前非常感谢!
解决方案
您的错误在于:两个值(name 和 page)只在 print 语句中被填入了 URL 模板,而没有用于实际请求的 url:
print(url.format(Fore.RED + name + Style.RESET_ALL, Fore.YELLOW + str(page) + Style.RESET_ALL))
而在这里,实际发起请求时只向 url 模板传入了一个值(page),缺少 name:
r = req.get(url.format(page))
下面是两种不同的修正方法:
方法一
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
from colorama import Fore, Style
def main(url):
    """Scrape search results for several names and pages and save them to CSV.

    For each search name and each page 1-8, the page is fetched and the
    listed names, addresses and phone numbers are extracted. Rows whose
    phone number starts with "06" or "07" are printed, collected, and
    finally written to a CSV file.

    Args:
        url: URL template with two positional ``{}`` placeholders, filled in
            order with the search name (``who``) and the page number
            (``page``).
    """
    # Function-scope import so the block brings its own dependency into scope.
    from urllib.parse import quote_plus

    search_names = ["Bryan", "David", "Robert"]
    with requests.Session() as req:
        data = []
        for name in search_names:
            for page in range(1, 9):
                # Colour codes are for terminal display only; they must never
                # end up in the URL that is actually requested.
                print(url.format(Fore.RED + name + Style.RESET_ALL,
                                 Fore.YELLOW + str(page) + Style.RESET_ALL))
                print(f"Extracting Page# {page}")

                # Format into a new variable: rebinding `url` would destroy
                # the template, and every later iteration would silently
                # re-request the first name/page combination.
                page_url = url.format(quote_plus(name), page)
                r = req.get(page_url)
                soup = bs(r.content, 'html.parser')

                # Distinct local names so the list of search names being
                # iterated is not rebound.
                found_names = [tag.text for tag in soup.select("h2.name.title.inbl")]
                addresses = [tag.text for tag in soup.select("div.h4.address.mtreset")]
                # Phone numbers are taken from the raw response text: the
                # digits following `mainLine":"`.
                phones = [m.group(1) for m in re.finditer(r'mainLine":"(\d+)', r.text)]

                # Entries are paired by position; zip stops at the shortest list.
                for x, y, z in zip(found_names, addresses, phones):
                    if z.startswith(("06", "07")):
                        data.append([x, y, z])
                        print(z)

        df = pd.DataFrame(data, columns=["Name", "Address", "Phone"])
        print(df)
        df.to_csv(r'C:////////.csv', index=False)
        print("Data Saved to your csv")
# Run the scraper; the template's two `{}` placeholders are the `who` and `page` query values.
main("https://www.#########.com/search?part=1&who={}&page={}")
方法二
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
from colorama import Fore, Style
def main(url):
    """Scrape search results for several names and pages and save them to CSV.

    The query string is not formatted by hand; the values are passed through
    the ``params`` argument of the session's ``get`` call. For each search
    name and each page 1-8, names, addresses and phone numbers are extracted.
    Rows whose phone number starts with "06" or "07" are printed, collected,
    and finally written to a CSV file.

    Args:
        url: Base search URL without a query string.
    """
    search_names = ["Bryan", "David", "Robert"]
    with requests.Session() as req:
        data = []
        for name in search_names:
            for page in range(1, 9):
                # Plain values only: colorama colour codes are terminal
                # escape sequences and would otherwise be sent to the server
                # as part of the `who` / `page` query values.
                url_parm = {
                    'part': 1,
                    'who': name,
                    'page': page,
                }
                print(f"Extracting Page# {page}")
                r = req.get(url, params=url_parm)
                soup = bs(r.content, 'html.parser')

                # Distinct local names so the list of search names being
                # iterated is not rebound.
                found_names = [tag.text for tag in soup.select("h2.name.title.inbl")]
                addresses = [tag.text for tag in soup.select("div.h4.address.mtreset")]
                # Phone numbers are taken from the raw response text: the
                # digits following `mainLine":"`.
                phones = [m.group(1) for m in re.finditer(r'mainLine":"(\d+)', r.text)]

                # Entries are paired by position; zip stops at the shortest list.
                for x, y, z in zip(found_names, addresses, phones):
                    if z.startswith(("06", "07")):
                        data.append([x, y, z])
                        print(z)

        df = pd.DataFrame(data, columns=["Name", "Address", "Phone"])
        print(df)
        df.to_csv(r'C:////////.csv', index=False)
        print("Data Saved to your csv")
# Base search URL with no query string; main() supplies the query values via `params`.
main("https://www.#########.com/search")
推荐阅读
- xamarin.forms - 为什么在使用 ReactiveContentPage 时会出现“不一致的可访问性”问题?
- java - 尝试从输入流(逐个字符)将单词解析为 HashMap,但是,空格不断出现?
- react-native - 如何从 React Native 中的不同组件调用方法
- android - IronSource 奖励视频加载
- c# - 我正在尝试让人工智能跟随一条路径,但它不起作用
- android - 有没有办法使用同一个lib的2个不同版本,所以android中的文件
- r - R中的数学约束优化
- php - PHP中window.location.origin的替代
- python - 如何循环遍历 BS4 数据并正确打印 div 标签
- java - 用 Kotlin 创建的房间数据库,用 Java 编写的活动,我可以这样做吗?