python - 为每个列表执行一个函数,在列表中比 for 循环更快 [网络抓取]
问题描述
我正在构建一个抓取脚本。我有一个包含客户详细信息的 .csv 文件 - 每行一个客户,每行中的第一个单元格是客户姓名。我想抓取 Google 新闻并为每个客户名称获取前 5 个结果。
现在我有一个 for 循环,它迭代每一行,获取客户的姓名,运行抓取函数并打印结果。缺点是速度很慢,因为它必须先处理完一行,才能继续处理下一行。
我想知道是否存在更快的东西,我想从 csv 文件中读取行并有一个列表列表;然后为所有列表并行运行抓取功能。
import csv
import re

import bs4
import requests
class Scraper():
    """Scrapes Google News for a search term and collects up to five
    keyword-matching headlines into the module-level ``articles_str``."""

    def __init__(self):
        pass

    def ScrapeWebStr(self, account):
        """Fetch Google News results for *account* (restricted to the last
        3 days) and build ``articles_str`` from the first 5 articles whose
        headline contains a keyword from the module-level ``keyword_list``.

        NOTE: the result is published through the module-level global
        ``articles_str`` (the driver script prints it after each call), so
        the global is kept for backward compatibility.
        """
        base_url = 'https://news.google.com/search?q={}%20when%3A3d&hl=en-US&gl=US&ceid=US%3Aen'
        request = requests.get(base_url.format(account))
        webcontent = bs4.BeautifulSoup(request.content, 'lxml')
        counter = 0
        global articles_str
        articles_str = ""
        # All the news cards in Google News carry jslog 93789; iterate them.
        for i in webcontent.findAll('div', {'jslog': '93789'}):
            if counter == 5:
                break
            # Take the article link for each news item (first match only).
            for link in i.findAll('a', attrs={'href': re.compile("/articles/")}, limit=1):
                # Hoisted: read the headline once instead of twice.
                headline = i.select_one('h3').getText()
                # Keep only articles whose headline contains a keyword.
                if any(keyword in headline for keyword in keyword_list):
                    articles_str = (articles_str + str(headline) + '\n'
                                    + "https://news.google.com" + str(link.get('href'))
                                    + '\n' + ('-' * 80) + '\n')
                    counter += 1
                    if counter == 5:
                        break

    def fileOpen(self, file):
        """Read *file* as UTF-8 CSV and return all rows as a list of lists.

        Fix: the original left the file handle open; a ``with`` block now
        closes it deterministically.
        """
        with open(file, encoding='utf-8') as data:
            return list(csv.reader(data))
#FILTERS
keyword_list = ['ACQUISITION', 'Acquisition', 'BALANCE', 'BAN', 'BOND', 'BRAND', 'Balance', 'Ban', 'Bond', 'Brand', 'CAPITAL', 'COSTS', 'CRISIS', 'CUSTOMERS', 'Capital', 'Costs', 'Crisis', 'Customers', 'DEBT', 'DEMAND', 'Debt', 'Demand', 'ECONOMY', 'Economy', 'FINANCE', 'FINANCIAL', 'FUND', 'Finance', 'Financial', 'Fund', 'GROWTH', 'Growth', 'INVESTOR', 'IPO', 'IPO', 'Investor', 'LAYOFF', 'Layoff', 'MARKET', 'MERGER', 'Market', 'Merger', 'NEW', 'New', 'PAY', 'PAYABLES', 'PROFIT', 'Pay', 'Payables', 'Profit', 'RATING', 'RECEIVABLES', 'REVENUES', 'Rating', 'Receivables', 'Revenues', 'SALES', 'SHARE', 'SHEET', 'SIZE', 'STOCK', 'SUE', 'Sales', 'Share', 'Sheet', 'Size', 'Stock', 'Sue', 'TREND', 'Trend', 'USAGE', 'Usage', 'acquisition', 'balance', 'ban', 'bond', 'brand', 'capital', 'costs', 'crisis', 'customers', 'debt', 'demand', 'economy', 'finance', 'financial', 'fund', 'growth', 'investor', 'ipo', 'layoff', 'market', 'merger', 'new', 'pay', 'payables', 'profit', 'rating', 'receivables', 'revenues', 'sales', 'share', 'sheet', 'size', 'stock', 'sue', 'trend', 'usage'] #List of keywords to filter
#UPDATE + MAIL VERSION
# Driver: read customer rows from the CSV and scrape news for each name.
scraper = Scraper()
customer_rows = scraper.fileOpen("Excel for Scraping.csv")
for customer_row in customer_rows:
    # First cell of each row is the customer name; the scrape fills the
    # module-level ``articles_str``, printed once per customer.
    scraper.ScrapeWebStr(customer_row[0])
    print(articles_str)
for 循环减慢了一切,而我的计算能力处于闲置状态。我觉得解决方案可能涉及多进程,但我不明白在没有 for 循环的情况下,如何取出主列表中每个子列表的第 [0] 项并用多进程运行。我不希望您为我编写代码,但任何提示都会有很大帮助!
非常感谢
解决方案
我使用 httpx 和 asyncio 来并发发出请求,用 fake_useragent 伪造用户代理,并在代码中简化了 CSS 选择器。Repl.it 上有一个完整的示例。
请注意,我注释掉了 Execution.fileOpen 的调用,改用了一个硬编码的搜索词列表。
from bs4 import BeautifulSoup
import csv, time, asyncio, httpx
from fake_useragent import UserAgent
class Scraper():
    """Async scraper: fetches a Google News search and returns up to five
    keyword-matching headlines as one formatted string."""

    def __init__(self):
        pass

    async def ScrapeWebStr(self, account: str) -> str:
        """Search Google News for *account* and return a string with up to 5
        articles whose headline contains a keyword from ``KEYWORD_LIST``.

        Sends a random fake User-Agent so the request looks like a browser.
        """
        async with httpx.AsyncClient() as client:
            ua = UserAgent()
            headers = {"User-Agent": ua.random}
            params = {
                "q": f"{account} when",
                "hl": "en-US",
                "gl": "US",
                "ceid": "US:en"
            }
            response = await client.get(
                'https://news.google.com/search',
                params=params,
                headers=headers)
            webcontent = BeautifulSoup(response.text, 'lxml')
            counter = 0
            articles_str = ""
            # All the news in GN have jslog 93789. This iterates all the news.
            for i in webcontent.select('div[jslog="93789"]'):
                if counter == 5:
                    break
                else:
                    for link in i.select('h3 a[href*="/articles/"]'):
                        article_text = link.getText()
                        # filter articles that have a keyword
                        if any(keyword in article_text
                               for keyword in KEYWORD_LIST):
                            articles_str += f"""{article_text}
https://news.google.com{link.get('href')}
{'-' * 80}
"""
                            counter += 1
                            if counter == 5:
                                break
            return articles_str

    def fileOpen(self, file):
        """Read *file* as UTF-8 CSV and return all rows as a list of lists.

        Bug fix: the original built ``list(csv.reader(data))`` but never
        returned it, so the method always returned None (unnoticed because
        the call site is commented out). The ``return`` is now in place.
        """
        with open(file, encoding='utf-8') as data:
            return list(csv.reader(data))
# List of keywords to filter
KEYWORD_LIST = [
'ACQUISITION', 'Acquisition', 'BALANCE', 'BAN', 'BOND', 'BRAND', 'Balance',
'Ban', 'Bond', 'Brand', 'CAPITAL', 'COSTS', 'CRISIS', 'CUSTOMERS',
'Capital', 'Costs', 'Crisis', 'Customers', 'DEBT', 'DEMAND', 'Debt',
'Demand', 'ECONOMY', 'Economy', 'FINANCE', 'FINANCIAL', 'FUND', 'Finance',
'Financial', 'Fund', 'GROWTH', 'Growth', 'INVESTOR', 'IPO', 'IPO',
'Investor', 'LAYOFF', 'Layoff', 'MARKET', 'MERGER', 'Market', 'Merger',
'NEW', 'New', 'PAY', 'PAYABLES', 'PROFIT', 'Pay', 'Payables', 'Profit',
'RATING', 'RECEIVABLES', 'REVENUES', 'Rating', 'Receivables', 'Revenues',
'SALES', 'SHARE', 'SHEET', 'SIZE', 'STOCK', 'SUE', 'Sales', 'Share',
'Sheet', 'Size', 'Stock', 'Sue', 'TREND', 'Trend', 'USAGE', 'Usage',
'acquisition', 'balance', 'ban', 'bond', 'brand', 'capital', 'costs',
'crisis', 'customers', 'debt', 'demand', 'economy', 'finance', 'financial',
'fund', 'growth', 'investor', 'ipo', 'layoff', 'market', 'merger', 'new',
'pay', 'payables', 'profit', 'rating', 'receivables', 'revenues', 'sales',
'share', 'sheet', 'size', 'stock', 'sue', 'trend', 'usage'
]
async def main():
    """Scrape every search term concurrently and report per-term and total timing."""
    overall_start = time.monotonic()
    print(f"Started main")
    # UPDATE + MAIL VERSION
    scraper = Scraper()

    async def scrape_and_print(row):
        # Per-term worker: scrape, then report how long this term took.
        term_start = time.monotonic()
        print(f"Searching for '{row}'")
        # Execution.ScrapeWebStr(row[0])
        result = await scraper.ScrapeWebStr(row)
        print(
            f"Finished searching for '{row}' in {time.monotonic() - term_start}"
        )
        print(result)

    # Hard-coded terms stand in for Execution.fileOpen("Excel for Scraping.csv").
    metadata = ['stackoverflow', 'google']
    # return_exceptions=True keeps one failed term from cancelling the rest.
    workers = (scrape_and_print(row) for row in metadata)
    await asyncio.gather(*workers, return_exceptions=True)
    print(f"Finished main in {time.monotonic() - overall_start}")

asyncio.run(main())
输出
注意:输出链接中的 news.google.com. 带有一个尾随点——完全限定域名(FQDN)的末尾允许带点(来源:1、2、3)。
Started main
Searching for 'stackoverflow'
Searching for 'google'
Finished searching for 'stackoverflow' in 5.42717178100429
Stack Overflow reports strong growth from COVID-19 workplace changes
https://news.google.com./articles/CBMiY2h0dHBzOi8vd3d3LnpkbmV0LmNvbS9hcnRpY2xlL3N0YWNrLW92ZXJmbG93LXJlcG9ydHMtc3Ryb25nLWdyb3d0aC1mcm9tLWNvdmlkLTE5LXdvcmtwbGFjZS1jaGFuZ2VzL9IBbmh0dHBzOi8vd3d3LnpkbmV0LmNvbS9nb29nbGUtYW1wL2FydGljbGUvc3RhY2stb3ZlcmZsb3ctcmVwb3J0cy1zdHJvbmctZ3Jvd3RoLWZyb20tY292aWQtMTktd29ya3BsYWNlLWNoYW5nZXMv?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Stack Overflow gets $85M Series E funding to expand SaaS product
https://news.google.com./articles/CBMiZWh0dHBzOi8vd3d3LnRlY2hyZXB1YmxpYy5jb20vYXJ0aWNsZS9zdGFjay1vdmVyZmxvdy1nZXRzLTg1bS1zZXJpZXMtZS1mdW5kaW5nLXRvLWV4cGFuZC1zYWFzLXByb2R1Y3Qv0gFwaHR0cHM6Ly93d3cudGVjaHJlcHVibGljLmNvbS9nb29nbGUtYW1wL2FydGljbGUvc3RhY2stb3ZlcmZsb3ctZ2V0cy04NW0tc2VyaWVzLWUtZnVuZGluZy10by1leHBhbmQtc2Fhcy1wcm9kdWN0Lw?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Stack Overflow expands its Teams service with new integrations
https://news.google.com./articles/CAIiELYwgKfey9CR8SXDOjhn0zMqFAgEKg0IACoGCAowlIEBMLEXMOc_?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
MongoDB: A Database For The New Era
https://news.google.com./articles/CAIiEC1Lr6ctqyb6pi2PkH_xsfAqFggEKg0IACoGCAowkqEGMJBZMLLouwY?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
EXCLUSIVE: Communities, brand and then product — CEO explains how Stack Overflow flipped the script on softwar
https://news.google.com./articles/CBMivQFodHRwczovL3d3dy5idXNpbmVzc2luc2lkZXIuaW4vdGVjaC9lbnRlcnByaXNlL25ld3Mvc3RhY2stb3ZlcmZsb3ctY2VvLXByYXNoYW50aC1jaGFuZHJhc2VrYXItZXhwbGFpbnMtaG93LXN0YWNrLW92ZXJmbG93LWZsaXBwZWQtdGhlLXNjcmlwdC1vbi1zb2Z0d2FyZS1kZXZlbG9wbWVudC9hcnRpY2xlc2hvdy83ODg1NDU2NC5jbXPSAcEBaHR0cHM6Ly93d3cuYnVzaW5lc3NpbnNpZGVyLmluL3RlY2gvZW50ZXJwcmlzZS9uZXdzL3N0YWNrLW92ZXJmbG93LWNlby1wcmFzaGFudGgtY2hhbmRyYXNla2FyLWV4cGxhaW5zLWhvdy1zdGFjay1vdmVyZmxvdy1mbGlwcGVkLXRoZS1zY3JpcHQtb24tc29mdHdhcmUtZGV2ZWxvcG1lbnQvYW1wX2FydGljbGVzaG93Lzc4ODU0NTY0LmNtcw?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Finished searching for 'google' in 2.2639846410020255
Google Home: 5 ways to fix the issue when Google Assistant doesn't understand you
https://news.google.com./articles/CAIiEEH9NNUgXleahne_qOb-cRcqFQgEKgwIACoFCAow4GowoAgwhuCMBg?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Pocket Casts is up for sale nearly three years after acquisition by public radio consortium
https://news.google.com./articles/CAIiEE0kkabL7el7Et9iUgNvaFoqGQgEKhAIACoHCAowyoD5CjD5z-ACMM_rvwU?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Google reportedly requires new Android TV devices support AV1 video decoding
https://news.google.com./articles/CBMiUWh0dHBzOi8vd3d3LnhkYS1kZXZlbG9wZXJzLmNvbS9nb29nbGUtcmVxdWlyZXMtbmV3LWFuZHJvaWQtdHYtYXYxLXZpZGVvLWRlY29kaW5nL9IBVWh0dHBzOi8vd3d3LnhkYS1kZXZlbG9wZXJzLmNvbS9nb29nbGUtcmVxdWlyZXMtbmV3LWFuZHJvaWQtdHYtYXYxLXZpZGVvLWRlY29kaW5nL2FtcC8?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Google previewing new Nest Hub alarms with more customization, tones, and ‘Sunrise’
https://news.google.com./articles/CAIiEPFJ-0ZIW0D3KmRfgtg-QLkqGQgEKhAIACoHCAowyoD5CjD5z-ACMM_rvwU?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Google 'throwing its weight around' by burying links to some commercial news sites, experts say
https://news.google.com./articles/CAIiEMds5Hwsm-sBaVKmnmg4yZ8qFggEKg4IACoGCAow3vI9MPeaCDDciw4?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Finished main in 6.28738788398914
或者,您可以使用SerpApi访问从 Google 新闻中提取的数据。它有免费试用版。
免责声明:我在 SerpApi 工作。