首页 > 解决方案 > 使用 BeautifulSoup 和 pandas 抓取标题下方的文本,并按索引与标题值匹配

问题描述

我能够在下面的代码中将标题索引与标题文本索引进行匹配。我想不通的是,当标题不在 soup(解析后的页面)中时,如何追加 np.NaN。这是我之前问题的后续。

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

def getAndParseURL(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole scraping loop.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    result = requests.get(url, timeout=timeout)
    return BeautifulSoup(result.text, 'html.parser')

# Pages to scrape. Each result list below is meant to end up with exactly one
# entry per URL so that list positions line up with the URLs.
urls_test = ['https://www.example.com/',
            'https://www.example.com/']

# One result list per header of interest.
engine = []
trans = []
color = []
interior = []

for url in urls_test:
    soup = getAndParseURL(url)
    # All <li> elements under the "lot-breakdown-list" <ul>; an empty result
    # sends control to the else branch at the bottom of the loop.
    ul   = soup.select('ul[class="list-inline lot-breakdown-list"] li', recursive=True)
    lis_e0 = []  # header texts (first child of each li's <h5>)
    lis_e1 = []  # second child of each <li> -- presumably the value text after the <h5>; verify against the page markup
    if ul:
        for li in ul:
            lis0 = []
            lis1 = []
            lis0.append(li.h5.contents[0])
            lis1.append(li.contents[1])
            lis_e0.extend(lis0) 
            lis_e1.extend(lis1) 
        # NOTE(review): in each of the four try blocks below, a page that lacks
        # the header makes the loop finish without appending anything. No
        # exception is raised in that case, so the except branch does not run
        # and np.NaN is never added -- the result lists can therefore end up
        # shorter than urls_test. A header that matches more than once would
        # likewise append more than one value for a single URL.
        try:        
            # Same pairing as the zip-based loops below, written with an index range.
            for i in range(min(len(lis_e1), len(lis_e0))):
                if 'Engine' in lis_e0[i]:
                    engine.append(lis_e1[i])   
        except:
            engine.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Trans' in x:
                    trans.append(lis_e1[i])  
        except:
            trans.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Color' in x:
                    color.append(lis_e1[i])  
        except:
            color.append(np.NaN)
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Interior' in x:
                    interior.append(lis_e1[i])  
        except:
            interior.append(np.NaN)
    else:
        # No matching <li> on the page at all: record a missing value in every list.
        engine.append(np.NaN)
        trans.append(np.NaN)
        color.append(np.NaN)
        interior.append(np.NaN)

# Bare expressions: these do nothing when run as a script; presumably meant
# for display in an interactive session.
engine
trans
color
interior

# Lengths of the result lists; they should all equal len(urls_test).
print(str(len(engine)))
print(str(len(trans)))
print(str(len(color)))
print(str(len(interior)))

Out:
['383 CI']
['Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
1
1
2
2

下面是我期望的输出('engine' 的 for 循环写法不同,但效果应该相同)。各列表的长度必须与 URL 的数量一致,否则在抓取多个 URL 时,列表索引将无法对应到正确的 URL。感谢您抽出宝贵的时间!

['NaN', '383 CI']
['NaN', 'Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
2
2
2
2

标签: python pandas web-scraping indexing beautifulsoup

解决方案


使用 try..except 块。

import requests
from bs4 import BeautifulSoup

def getAndParseURL(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole scraping loop.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    result = requests.get(url, timeout=timeout)
    return BeautifulSoup(result.text, 'html.parser')

urls_test = ['https://www.example.com/',
            'https://www.example.com/']

# Placeholder recorded when a header is absent from a page. It is the plain
# string "Nan", so pandas will treat it as ordinary text, not as a missing value.
MISSING = "Nan"

# First <li> directly under the "lot-breakdown-list" <ul>; every header lookup
# starts its forward search from this element.
SPEC_LIST_ITEM = 'ul[class="list-inline lot-breakdown-list"]>li'


def _spec_value(soup, header):
    """Return the node that follows the ``<h5>`` whose text equals *header*.

    The search starts at the first breakdown-list ``<li>`` and moves forward
    through the document to the first ``<h5>`` whose string is exactly
    *header*; the result is the node two ``next_element`` steps after that
    ``<h5>`` (presumably the value text that follows the header -- verify
    against the page markup).

    Args:
        soup: Parsed page to search.
        header: Exact header text, e.g. ``'Engine'``.

    Returns:
        The located node, or ``MISSING`` when the list or the header is not
        present on the page.
    """
    first_item = soup.select_one(SPEC_LIST_ITEM)
    try:
        # select_one / find_next return None when nothing matches, so a missing
        # list or header surfaces as AttributeError on the next attribute
        # access. Only that case is treated as "not found"; anything else is a
        # real error and is allowed to propagate.
        return first_item.find_next('h5', string=header).next_element.next_element
    except AttributeError:
        return MISSING


engine = []
trans = []
color = []
interior = []

for url in urls_test:
    soup = getAndParseURL(url)

    # Exactly one append per list per URL keeps list positions aligned with
    # the positions in urls_test.
    engine.append(_spec_value(soup, 'Engine'))
    trans.append(_spec_value(soup, 'Trans'))
    color.append(_spec_value(soup, 'Color'))
    interior.append(_spec_value(soup, 'Interior'))

print(engine)
print(trans)
print(color)
print(interior)

输出

['Nan', '383 CI']
['Nan', 'Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']

在 DataFrame 中加载。

# Assemble the four per-URL result lists into a table: one row per URL,
# one column per header.
df = pd.DataFrame(
    dict(
        Engine=engine,
        Trans=trans,
        Color=color,
        Interior=interior,
    )
)
print(df)

输出

            Color  Engine Interior      Trans
0           Green     Nan    Black        Nan
1  Curious Yellow  383 CI    Black  Automatic

推荐阅读