python - 使用 python Beautifulsoup 进行网页抓取。如何抓取 dd 和 dt 元素中的文本?
问题描述
我正在尝试从黄页中获取一些公司信息。到目前为止一切都很顺利。但我无法获取特定公司页面中 dd 和 dt 元素中的文本。能否帮帮我?非常感谢每一个建议!谢谢。
这是我的代码:(我首先进入网站并获取搜索结果。然后,我获取各个公司网页的链接并解析其中的内容。问题是我无法获取各个公司页面中存储在 dd 元素中的信息。)
from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen
import csv

BASE_URL = "https://www.yellowpages.com"
SEARCH_URL = (
    BASE_URL
    + "/search?search_terms=expeditors&geo_location_terms=NJ"
    + "&page={page}&sort=&source=real-user"
)
HEADERS = {"User-Agent": "Mozilla/5.0"}


def _fetch(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    The context manager guarantees the HTTP connection is closed even if
    parsing raises.
    """
    req = Request(url, headers=HEADERS)
    with urlopen(req) as resp:
        return soup(resp, "html.parser")


def _text_or_none(tag):
    """Return ``tag.text``, or the string 'None' when *tag* is missing.

    Replaces the original bare ``try/except`` blocks: the only realistic
    failure was ``find`` returning ``None`` and ``.text`` raising
    AttributeError, so we test for ``None`` explicitly instead of
    swallowing every exception.
    """
    return tag.text if tag is not None else str(None)


def scrape_business(container):
    """Extract one search-result card plus its detail page into a row list.

    *container* is the ``div.v-card`` for one listing.  The detail page is
    fetched to read the ``<dd>`` elements (general info / working hours).
    """
    business_name = container.find("a", {"class": "business-name"}).span.text
    link = str(container.find("a", {"class": "business-name"}).get("href"))

    # BUG FIX: the detail link is relative to yellowpages.com. The original
    # code prefixed "https://www.yellowbook.com", so the page fetched was
    # never the listing's detail page — which is why its <dd>/<dt> elements
    # could not be found.
    detail_page = _fetch(BASE_URL + link)
    info_list = detail_page.findAll("dd")

    business_type = _text_or_none(container.find("div", {"class": "categories"}))
    years_in_business = _text_or_none(container.find("div", {"class": "count"}))

    website_tag = container.find("a", {"class": "track-visit-website"})
    business_website = website_tag.get("href") if website_tag is not None else str(None)

    street = container.find("div", {"class": "street-address"})
    locality = container.find("div", {"class": "locality"})
    if street is not None and locality is not None:
        business_address = street.text + " " + locality.text
    else:
        business_address = str(None)

    business_phone = _text_or_none(
        container.find("div", {"class": "phones phone primary"})
    )

    # The detail page typically exposes general info in the first <dd> and
    # working hours in the second; guard against shorter lists explicitly.
    general_info = info_list[0].text if len(info_list) > 0 else str(None)
    work_hours = info_list[1].text if len(info_list) > 1 else str(None)

    return [business_name, business_type, years_in_business, business_website,
            business_address, business_phone, general_info, work_hours]


def main():
    """Crawl 50 result pages and write one CSV row per business.

    Uses the csv module, so the manual replace(',', '|') quoting hacks of
    the original are unnecessary, and the header now matches the columns
    actually written (the original header listed fields that were never
    scraped, and glued general_info and work_hours into one column).
    """
    out_filename = "expeditors_in_NJ.csv"
    with open(out_filename, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["business_name", "business_type", "years_in_business",
                         "business_website", "business_address", "business_phone",
                         "general_info", "work_hours"])
        for page in range(1, 51):
            page_soup = _fetch(SEARCH_URL.format(page=page))
            # BUG FIX: attrs must be a dict ({"class": "v-card"}); the
            # original passed a *set* literal {"class", "v-card"}, which
            # does not filter on the class attribute's value.
            for container in page_soup.findAll("div", {"class": "v-card"}):
                row = scrape_business(container)
                print(*row, sep="\n")
                writer.writerow(row)


if __name__ == "__main__":
    main()
如果您想大量修改代码或以完全不同的方式进行修改,请给出一些解释,以便我理解。我是编程新手。非常感谢。
解决方案
import httpx
import trio
from bs4 import BeautifulSoup
import csv
# At most six coroutines may hold the limiter (i.e. be mid-request) at once.
limit = trio.CapacityLimiter(6)


async def scrape(client, item, sender):
    """Fetch one business detail page and emit [name, website, years, phone].

    *item* is a (business_name, relative_href) pair produced by ``worker``;
    the row is pushed into the memory channel for ``rec`` to write out.
    """
    async with limit, sender:
        response = await client.get(f'https://www.yellowpages.com{item[1]}')
        page = BeautifulSoup(response.text, 'lxml')

        # select_one returns None on a miss: subscripting None raises
        # TypeError, attribute access raises AttributeError.
        try:
            website = page.select_one('.primary-btn')['href']
        except (TypeError, AttributeError):
            website = None

        try:
            phone_number = page.select_one('p.phone').text
        except (TypeError, AttributeError):
            phone_number = None

        try:
            years = page.select_one('.number').text
        except AttributeError:
            years = None

        row = [item[0], website, years, phone_number]
        print(row)
        await sender.send(row)
async def worker(client, num, sender, nurse):
    """Fetch search-results page *num* and spawn one scrape task per listing.

    Each child task gets its own clone of the send channel so the channel
    only closes once every producer has finished.
    """
    async with limit, sender:
        query = {
            "search_terms": "expeditors",
            "geo_location_terms": "NJ",
            "page": num,
            "sort": "",
            "source": "real-user",
        }
        response = await client.get('https://www.yellowpages.com/search',
                                    params=query)
        page = BeautifulSoup(response.text, 'lxml')

        # Each .business-name anchor yields (display name, relative href).
        listings = [(tag.span.text, tag['href'])
                    for tag in page.select('.business-name')]
        for listing in listings:
            nurse.start_soon(scrape, client, listing, sender.clone())
async def main():
    """Wire up the HTTP client, the result channel, and the task nursery."""
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        client.headers.update(ua)
        # Capacity 0: every send rendezvouses directly with the receiver.
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        async with sender:
            # Only page 1 is crawled here; widen the range for more pages.
            for page_number in range(1, 2):
                nurse.start_soon(worker, client, page_number,
                                 sender.clone(), nurse)
async def rec(receiver):
    """Drain the channel, appending each row to result.csv as it arrives.

    Runs until every sender clone has been closed, at which point the
    async-for loop ends and the file is closed by the with-block.
    """
    # buffering=1 keeps the file line-buffered so rows reach disk promptly.
    with open('result.csv', 'w', buffering=1, newline='') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(['Business Name', 'Website', 'Years In Business', 'Phone'])
        async with receiver:
            async for row in receiver:
                out.writerow(row)


if __name__ == "__main__":
    trio.run(main)
推荐阅读
- c# - 使用 C# 将数据插入 SQL Server
- tensorflow - 在 Keras/Tensorflow 中实现自定义 WARP 损失函数时出现错误:LookupError: No gradient defined for operation
- java - 如何为 Nim 游戏制作这个循环?
- javascript - 与 lowdb 左连接
- c# - 是否可以使用 Windows 图标选择器对话框?
- c# - 最大化时如何使win表单面板保持在相同的位置
- android - 为什么我的 Viewpager 不显示 CrimeFragment?
- php - 如何在项目中同时使用 web 和 api 身份验证器
- unit-testing - `#[test]` 是否暗示 `#[cfg(test)]`?
- c - 检查比实际病毒大的二进制文件中的病毒签名