python - Execute a script with multiple browsers using Selenium (Python)
Problem description
How do I execute the script below using multiple browsers? Every n urls should be handled by a separate browser, and I should be able to define the value of n (parallel scraping).
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver

browser = webdriver.Chrome()


class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []


def parse_data(url):
    while True:
        try:
            browser.get(url)
            df = pd.read_html(browser.page_source)[0]
            break
        except KeyError:
            browser.quit()
            continue
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data


# URLs go here
urls = {
    "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
    "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
    "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
    "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
    "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
    "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
    "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
    "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
    "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
    "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
}
if __name__ == '__main__':
    results = None
    for url in urls:
        game_data = parse_data(url)
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)

    print(results)
Currently the script uses a single browser window for all the urls. How can the code be modified to open multiple browser instances, one per n urls, so the same job finishes faster, and then append everything to results?
Solution
Using DevTools in Chrome/Firefox (tab: Network, filters: JS, XHR) I found the URLs which the page uses to get its data from the server with AJAX:

https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/xbNfvuAM/X0/1/0/1/
https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/l8FEjeUE/X0/1/0/1/
etc.

The URLs are all similar; the only difference is the code xbNfvuAM, l8FEjeUE, etc. I found PageTournament({"id":"l8FEjeUE", ... in the page source, so I can generate these URLs myself. This way I can get the HTML without Selenium, using only requests.
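As a rough illustration of just that id-extraction step (a sketch, not part of the original answer; the page_url below is one of the urls from the question, and the regular expression is simply an alternative to str.find):

import re
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'referer': 'https://www.oddsportal.com/',
}

# one of the results pages from the question
page_url = 'https://www.oddsportal.com/soccer/romania/superliga-women/results/#/'

text = requests.get(page_url, headers=headers).text

# the page source contains `PageTournament({"id":"xbNfvuAM", ...` - grab the id
match = re.search(r'PageTournament\(\{"id":"(\w+)"', text)
if match:
    code = match.group(1)
    ajax_url = f'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/{code}/X0/1/0/1/'
    print(ajax_url)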
The original code needs ~20s, the requests version only ~6s.

BTW: I also reduced parse_data so that it only uses a DataFrame, without the GameData class.
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup as bs
import time
from multiprocessing import Pool

# --- functions ---

def get_html(url):
    r = requests.get(url, headers=headers)
    text = r.text

    start = text.find('PageTournament({"id":"') + len('PageTournament({"id":"')
    end = text.find('"', start)
    code = text[start:end]
    print(f'code: {code}')

    url = f'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/{code}/X0/1/0/1/'
    r = requests.get(url, headers=headers)
    text = r.text

    # remove `globals.jsonpCallback('...',` at the start
    text = text.split(',', 1)[1]
    text = text[:-2]  # remove `);` at the end

    # print('json:', text[:25], '...', text[-25:])  # may display partially because other processes may print their own text
    print(f'json: {text[:25]} ... {text[-25:]}')  # display it all in one piece

    data = json.loads(text)
    html = data['d']['html']

    # print('html:', html[:25], '...', html[-25:])  # may display partially because other processes may print their own text
    print(f'html: {html[:25]} ... {html[-25:]}')

    return html

def parse_data(html):
    try:
        df = pd.read_html(html)[0]
    except KeyError:
        print('KeyError')
        return

    soup = bs(html, "lxml")
    header = soup.select('table th.first2.tl a')
    if not header:
        return

    df['country'] = header[1].text
    df['league'] = header[2].text

    return df

def process(url):
    return parse_data(get_html(url))

# --- main ---

# needed headers - on some systems they have to be outside `__main__`
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'referer': 'https://www.oddsportal.com/',
}

if __name__ == '__main__':
    # urls for AJAX requests
    # ajax_urls = {
    #     # for 'view-source:https://www.oddsportal.com/soccer/romania/superliga-women/results/#/'
    #     'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/xbNfvuAM/X0/1/0/1/',
    #     # for 'https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/'
    #     'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/l8FEjeUE/X0/1/0/1/',
    # }
    # you can find `l8FEjeUE` in the original page as `PageTournament({"id":"l8FEjeUE", ...`

    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
        "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
        "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
        "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
    }

    time_start = time.time()

    # empty `DataFrame` so I don't have to check `if results is None`
    results = pd.DataFrame()

    with Pool(10) as p:
        all_game_data = p.map(process, urls)

    for game_data in all_game_data:
        if game_data is None:
            #print('game_data', game_data)
            continue
        results = results.append(game_data, ignore_index=True)

    time_end = time.time()
    time_diff = (time_end - time_start)
    print(f'time: {time_diff:.2f} s')

    print('--- results ---')
    print(results)
EDIT:

As @αԋɱҽԃαмєяιcαη found out, headers has to be outside __main__, because on some systems it may otherwise raise an error.

Documentation: multiprocessing
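A minimal sketch of why this matters, assuming the 'spawn' start method (the default on Windows and macOS): each worker process re-imports the module, so it only sees names defined at module level, not names created inside the if __name__ == '__main__': block.

from multiprocessing import Pool

# module level: re-created in every worker when the module is re-imported
headers = {'user-agent': 'Mozilla/5.0'}

def worker(url):
    # uses the module-level `headers`; if `headers` were defined only inside
    # the `__main__` block, this would raise NameError on spawn-based systems
    return (url, headers['user-agent'])

if __name__ == '__main__':
    with Pool(2) as p:
        print(p.map(worker, ['a', 'b']))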
EDIT:

I created code which uses multiprocessing to run the original code. The problem is that the browser can't be sent to the processes, so every process has to run its own Selenium, and it displays 5 browsers at the same time. It also needs more time to start all the browsers, which brings me to ~40s.

Maybe if the processes were run with queues to receive a URL and send back the HTML, then one browser could be reused (or a few browsers running at the same time), but it would need more complex code; a rough sketch of that idea is shown after the code below.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
from multiprocessing import Pool

# --- classes ---

class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

# --- functions ---

def parse_data(url):
    browser = webdriver.Chrome()
    while True:
        try:
            browser.get(url)
            df = pd.read_html(browser.page_source)[0]
            break
        except KeyError:
            print('KeyError:', url)
            continue
    html = browser.page_source
    browser.quit()
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data

# --- main ---

if __name__ == '__main__':
    # URLs go here
    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
        "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
        "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
        "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
    }

    time_start = time.time()

    results = None

    with Pool(5) as p:
        all_game_data = p.map(parse_data, urls)

    for game_data in all_game_data:
        if game_data is None:
            #print('game_data', game_data)
            continue
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)

    time_end = time.time()
    time_diff = (time_end - time_start)
    print(f'time: {time_diff:.2f} s')

    print('--- results ---')
    print(results)
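A rough sketch of the browser-reuse idea mentioned above (my own addition, untested against the site): instead of queues it uses a Pool initializer, so each of n worker processes starts its own browser once and reuses it for all the urls it receives; the names n, init_browser and fetch are made up for this sketch.

from multiprocessing import Pool
from selenium import webdriver

_browser = None  # one browser per worker process, created once in the initializer

def init_browser():
    global _browser
    _browser = webdriver.Chrome()

def fetch(url):
    # reuses the worker's own browser instead of starting a new one per url;
    # a full version would parse the page here and quit the browser at the end
    _browser.get(url)
    return _browser.page_source

if __name__ == '__main__':
    urls = [
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
    ]

    n = 2  # number of browsers / worker processes

    with Pool(n, initializer=init_browser) as p:
        pages = p.map(fetch, urls)

    print([len(page) for page in pages])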