Web scraping with Python BeautifulSoup: how do I get the text inside dd and dt elements?

Problem description

I am trying to collect some company information from the Yellow Pages. So far everything works fine, but I just cannot get the text inside the dd and dt elements on an individual company's page. Could you be so kind as to help me? Every suggestion is greatly appreciated, thank you.

Here is my code (I first open the site and fetch the search results, then follow the link to each company's page and parse its content; the problem is that I cannot get at the information stored in the dd elements on those individual company pages):

from bs4 import BeautifulSoup as soup
import urllib.request
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from random import randint
import time
import socket

out_filename = "expeditors_in_NJ.csv"
headers = "business_name, business_type, business_website, business_phone, general_info, business_history, service_and_products, work_hours   \n"
f = open(out_filename, "w")
f.write(headers)

for i in range(0,50):
    page_url = "https://www.yellowpages.com/search?search_terms=expeditors&geo_location_terms=NJ&page=" + str(i+1) + "&sort=&source=real-user"
    req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = urlopen(req)
    page_soup = soup(uClient, "html.parser")
    uClient.close()
    containers = page_soup.findAll("div", {"class", "v-card"})
    for container in containers: 
        business_name = container.find("a", {"class":"business-name"}).span.text
        link = str(container.find("a",{"class":"business-name"}).get('href'))
        container_url = "https://www.yellowbook.com" + link
        req_ = Request(container_url, headers={'User-Agent': 'Mozilla/5.0'})
        uClient = urlopen(req_)
        container_soup = soup(uClient, "html.parser")
        uClient.close()
        info_list = container_soup.findAll("dd")
        
        try:
            business_type = container.find("div",{"class":"categories"}).text
        except:
            business_type = str(None)
        try: 
            years_in_business = str(container.find("div",{"class":"count"}).text)
        except: 
            years_in_business = str(None)
        try: 
            business_website = container.find("a",{"class":"track-visit-website"}).get('href')
        except: 
            business_website = str(None)
        try:
            business_address = container.find("div",{"class":"street-address"}).text + " " + container.find("div",{"class":"locality"}).text
        except:
            business_address = str(None)
        try: 
            business_phone = container.find("div",{"class":"phones phone primary"}).text
        except: 
            business_phone = str(None)
        try:
            general_info = info_list[0].text
        except:
            general_info = str(None)
        try:
            work_hours = info_list[1].text
        except:
            work_hours = str(None)
        
        print("business name: " + business_name + "\n")
        print("business type: " + business_type + "\n")
        print("years_in_business: " + years_in_business + "\n")
        print("business_website: " + business_website + "\n")
        print("business_address: " + business_address + "\n")
        print("business_phone: " + business_phone + "\n")
        print("general_info: " + general_info + "\n")
        print("work_hours: " + work_hours + "\n")

        
        f.write(business_name.replace(",", "|") + ", " + 
                business_type.replace(",", "|").replace("/", "|") + ", " + 
                years_in_business.replace(",", "|").replace("/", "|") + ", " + 
                business_website.replace(",", "|").replace("/", "|") + ", " + 
                business_address.replace(",", "|").replace("/", "|") + ", " + 
                business_phone.replace(",", "|").replace("/", "|") + ", " + 
                general_info.replace(",", "|").replace("/", "|") +
                work_hours.replace(",", "|").replace("/", "|") +
                "\n")

f.close()

If you want to change the code heavily, or do it in a completely different way, please add some explanation so that I can understand it. I am new to programming. Thank you very much.

Tags: python, web-scraping, beautifulsoup

Solution
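
On a Yellow Pages detail page the information you want sits in a definition list: each dt element is a label ("General Info", "Hours", ...) and the dd element right after it holds the value, so the usual approach is to walk the dt tags and pair each one with the dd that follows. Note also that your code builds the detail-page URLs on www.yellowbook.com even though the search results come from www.yellowpages.com, which may be one reason the dd lookups return nothing. A minimal sketch of the pairing idea, reusing the urllib/BeautifulSoup setup from the question (the placeholder URL and the exact labels are assumptions about the page layout):

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

# Placeholder: substitute a real detail-page link taken from the search results.
detail_url = "https://www.yellowpages.com/..."
req = Request(detail_url, headers={"User-Agent": "Mozilla/5.0"})
page = BeautifulSoup(urlopen(req), "html.parser")

# Pair every dt label with the dd value that follows it.
details = {}
for dt in page.find_all("dt"):
    dd = dt.find_next_sibling("dd")
    if dd is not None:
        details[dt.get_text(strip=True)] = dd.get_text(" ", strip=True)

print(details.get("General Info"))
print(details.get("Hours"))

Below is a complete rewrite that switches to httpx and trio so pages can be fetched concurrently: main() opens one AsyncClient and a nursery, starts a worker task per results page, and each worker spawns a scrape task for every business it finds. A CapacityLimiter keeps at most six requests in flight, and each scraped row travels through a memory channel to the rec task, which writes it to result.csv as it arrives.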


import httpx
import trio
from bs4 import BeautifulSoup
import csv

# Allow at most six requests to be in flight across all tasks.
limit = trio.CapacityLimiter(6)


async def scrape(client, item, sender):
    # Fetch one business detail page and pull out the website, phone number
    # and years in business; missing fields become None.
    async with limit, sender:
        r = await client.get(f'https://www.yellowpages.com{item[1]}')
        soup = BeautifulSoup(r.text, 'lxml')

        try:
            bw = soup.select_one('.primary-btn')['href']
        except (TypeError, AttributeError):
            bw = None
        try:
            phone = soup.select_one('p.phone').text
        except (TypeError, AttributeError):
            phone = None
        try:
            biy = soup.select_one('.number').text
        except AttributeError:
            biy = None

        result = [item[0], bw, biy, phone]
        print(result)
        await sender.send(result)


async def worker(client, num, sender, nurse):
    # Fetch one page of search results and spawn a scrape task per business.
    async with limit, sender:
        params = {
            "search_terms": "expeditors",
            "geo_location_terms": "NJ",
            "page": num,
            "sort": "",
            "source": "real-user"
        }
        r = await client.get('https://www.yellowpages.com/search', params=params)
        soup = BeautifulSoup(r.text, 'lxml')
        goal = [(i.span.text, i['href'])
                for i in soup.select('.business-name')]
        for what in goal:
            nurse.start_soon(scrape, client, what, sender.clone())


async def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        client.headers.update(headers)

        # Scraped rows flow through this channel to the CSV writer task.
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)

        async with sender:
            # Only the first results page; raise the upper bound to crawl more.
            for item in range(1, 2):
                nurse.start_soon(worker, client, item, sender.clone(), nurse)


async def rec(receiver):
    # Consumer task: append each scraped row to result.csv as it arrives.
    with open('result.csv', 'w', buffering=1, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Business Name', 'Website',
                        'Years In Business', 'Phone'])
        async with receiver:
            async for value in receiver:
                writer.writerow(value)

if __name__ == "__main__":
    trio.run(main)
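
This version needs the httpx, trio, beautifulsoup4 and lxml packages installed. If you also want the dd contents you originally asked about, the same dt/dd pairing can be dropped into scrape() once soup has been built; a hypothetical extension (the "General Info" and "Hours" labels are assumptions about the page, and the header row in rec() would need two extra columns to match):

        # Inside scrape(), after soup is built: collect dt/dd pairs and
        # append the ones you care about to the result row.
        details = {}
        for dt in soup.find_all('dt'):
            dd = dt.find_next_sibling('dd')
            if dd is not None:
                details[dt.get_text(strip=True)] = dd.get_text(' ', strip=True)
        result = [item[0], bw, biy, phone,
                  details.get('General Info'), details.get('Hours')]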
