python - BeautifulSoup:求两个列表的交集(其中一个列表由成对的字符串组成)
问题描述
我正在使用 beautifulSoup 来抓取几个网址。
BeautifulSoup 返回一个结果列表,如下所示:
list1 = ['url1','keyword1', 'url2', 'keyword2'...]
(成对的关键字和它们来自的网址)
我想把 list1 与另一个列表 list2(参考关键字列表)进行匹配。
我的目标是:如果 list2 中的 keyword_referenceX 也出现在 list1 中,就得到类似 ['urlX', 'keyword_referenceX'] 的结果。
import requests
from bs4 import BeautifulSoup
# Pages to scrape and the reference keywords to look for.
urls = ['url1', 'url2']
list2 = ['keyword_reference1', 'keyword_reference2']

# Flat list of alternating entries: url, heading text, url, heading text, ...
list1 = []
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    for heading in soup.find_all('h3'):
        text = heading.text
        # Filter: keep only headings that contain "1. " and do not contain "11."
        if "1. " not in text or "11." in text:
            continue
        list1 += [url, text]

# One entry per (keyword, list1 item) pair in which the keyword occurs as a
# substring of the item; only the keyword itself is recorded, not the url.
results = [
    keyword
    for keyword in list2
    for item in list1
    if keyword in item
]
print(results)
解决方案
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
# Pages to scrape and the reference keywords to look for.
urls = ['url1', 'url2']
list2 = ['keyword_reference1', 'keyword_reference2']

# Despite its name, list1 is a defaultdict mapping each url to the list of
# heading texts collected from that page.
list1 = defaultdict(list)
results = []

for url in urls:
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")
    for sub_heading in soup.find_all('h3'):
        # Filter: keep only headings that contain "1. " and do not contain "11."
        if "1. " in sub_heading.text and "11." not in sub_heading.text:
            list1[url].append(sub_heading.text)

for list2_element in list2:
    for k, v in list1.items():
        # k is the url, v is the list of heading texts found on that url.
        # Match the keyword as a substring of a heading: every stored heading
        # contains "1. ", so an exact-equality test against a bare keyword
        # would never succeed.
        if any(list2_element in heading for heading in v):
            # Record the url that owns the match (k). The loop variable `url`
            # from the scraping loop above would always be the last url.
            results.extend([k, list2_element])
print(results)