python - 使用 BeautifulSoup 和 pandas 抓取标题下方的文本,并使索引与标题值相匹配
问题描述
我已经能够用下面的代码把标题的索引与标题下方文本的索引对应起来。我想不通的是,当标题不在 soup(解析后的页面)中时该如何追加 np.NaN。这是我之前一个问题的后续。
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
def getAndParseURL(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole scraping loop.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``'html.parser'``.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` on connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # HTTP error statuses are not raised here; whatever body came back is
    # parsed as-is, so callers simply find no matching elements.
    return BeautifulSoup(response.text, 'html.parser')
# NOTE(review): the paste lost its indentation; the nesting below is
# reconstructed from the `if ul:` / `else:` pairing and the printed output.

# Pages to scrape (placeholder URLs).
urls_test = ['https://www.example.com/',
             'https://www.example.com/']
# One result list per field. The goal is exactly one entry per URL so that
# list index i always refers to urls_test[i].
engine = []
trans = []
color = []
interior = []
for url in urls_test:
    soup = getAndParseURL(url)
    # Despite the name, `ul` holds the matched <li> elements, not the <ul>.
    ul = soup.select('ul[class="list-inline lot-breakdown-list"] li', recursive=True)
    lis_e0 = []  # header text taken from each <li>'s <h5>
    lis_e1 = []  # second child of each <li>; presumably the value text after the <h5> - depends on page markup
    if ul:
        for li in ul:
            lis0 = []
            lis1 = []
            lis0.append(li.h5.contents[0])
            lis1.append(li.contents[1])
            lis_e0.extend(lis0)
            lis_e1.extend(lis1)
        # This is the problem being asked about: the `except` branch only
        # runs if the loop raises. When no header contains 'Engine' the loop
        # finishes normally without appending anything, so `engine` ends up
        # shorter than the number of URLs.
        try:
            for i in range(min(len(lis_e1), len(lis_e0))):
                if 'Engine' in lis_e0[i]:
                    engine.append(lis_e1[i])
        except:
            # NOTE(review): the `np.NaN` alias was removed in NumPy 2.0;
            # `np.nan` is the spelling that works on every version.
            engine.append(np.NaN)
        # Same pattern, same flaw: nothing is appended when 'Trans' is absent.
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Trans' in x:
                    trans.append(lis_e1[i])
        except:
            trans.append(np.NaN)
        # Same pattern for 'Color'.
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Color' in x:
                    color.append(lis_e1[i])
        except:
            color.append(np.NaN)
        # Same pattern for 'Interior'.
        try:
            for i, (x, y) in enumerate(zip(lis_e0, lis_e1)):
                if 'Interior' in x:
                    interior.append(lis_e1[i])
        except:
            interior.append(np.NaN)
    else:
        # No matching <li> at all on this page: record a missing value for
        # every field so the lists stay aligned with the URLs.
        engine.append(np.NaN)
        trans.append(np.NaN)
        color.append(np.NaN)
        interior.append(np.NaN)
# Bare names: these only display something in an interactive session/notebook.
engine
trans
color
interior
# The four lengths should all equal len(urls_test).
print(str(len(engine)))
print(str(len(trans)))
print(str(len(color)))
print(str(len(interior)))
Out:
['383 CI']
['Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
1
1
2
2
下面是我期望的输出('engine' 的 for 循环写法不同,但效果应该相同)。各列表的长度必须与 URL 的数量一致,否则在抓取多个 URL 时,列表索引将无法对应到正确的 URL。感谢您抽出宝贵的时间!
['NaN', '383 CI']
['NaN', 'Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
2
2
2
2
解决方案
使用 try..except 块。
import requests
from bs4 import BeautifulSoup
def getAndParseURL(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole scraping loop.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``'html.parser'``.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` on connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # HTTP error statuses are not raised here; whatever body came back is
    # parsed as-is, so callers simply find no matching elements.
    return BeautifulSoup(response.text, 'html.parser')
# Pages to scrape (placeholder URLs).
urls_test = ['https://www.example.com/',
             'https://www.example.com/']

# First <li> that is a direct child of the lot-breakdown list; the heading
# search starts from this element.
LOT_ITEM_SELECTOR = 'ul[class="list-inline lot-breakdown-list"]>li'
# Placeholder recorded when a heading is absent from a page.
MISSING = "Nan"


def _value_after_heading(soup, heading):
    """Return the node that follows the <h5> whose text equals *heading*.

    Args:
        soup: Parsed page, as returned by ``getAndParseURL``.
        heading: Exact text of the <h5> to look for, e.g. ``'Engine'``.

    Returns:
        The element two steps after the <h5> in document order (the first
        step is the heading's own text node), or ``MISSING`` when the list
        or the heading is not on the page.
    """
    try:
        first_item = soup.select_one(LOT_ITEM_SELECTOR)
        # Both select_one and find_next return None when nothing matches;
        # the attribute access on None is what signals "not on this page".
        return first_item.find_next('h5', string=heading).next_element.next_element
    except AttributeError:
        return MISSING


# One result list per field, one entry per URL, so index i in every list
# refers to urls_test[i].
engine = []
trans = []
color = []
interior = []
for url in urls_test:
    soup = getAndParseURL(url)
    # Exactly one append per field per page keeps the four lists aligned.
    engine.append(_value_after_heading(soup, 'Engine'))
    trans.append(_value_after_heading(soup, 'Trans'))
    color.append(_value_after_heading(soup, 'Color'))
    interior.append(_value_after_heading(soup, 'Interior'))
print(engine)
print(trans)
print(color)
print(interior)
输出:
['Nan', '383 CI']
['Nan', 'Automatic']
['Green', 'Curious Yellow']
['Black', 'Black']
在 DataFrame 中加载。
# This snippet never imports pandas, so bring `pd` into scope here.
import pandas as pd

# Each list has one entry per URL, so row i of the frame describes the
# i-th scraped page.
df = pd.DataFrame({"Engine": engine, "Trans": trans, "Color": color, "Interior": interior})
print(df)
输出:
Color Engine Interior Trans
0 Green Nan Black Nan
1 Curious Yellow 383 CI Black Automatic
推荐阅读
- docker - 为什么同一个docker镜像在不同的环境下会消耗不同的内存
- php - 库无法读取 Codeigniter 中的帖子输入
- kubernetes - Kubernetes如何在私网中容易受到网络攻击
- python - 在 python pandas 数据框中聚合行
- python - 是否需要神经网络(keras)才能使用分类器(sklearn)
- r - 无法为签名“xml_document”找到函数“readHTMLTable”的继承方法
- google-analytics - 可以解码谷歌分析的fbclid url,用户来自哪里?
- spring-batch - 使用 spring-cloud-deployer-cloudfoundry 启动分区的 spring 批处理作业
- sql - 如何使用条件返回多行计数?
- sql - 如何将本地服务器上运行的数据库迁移到另一台机器上