Python: executing a script with multiple browsers (Selenium)

Problem description

How can I execute the script below using multiple browsers?

Every n urls should be executed with a separate browser. I should be able to define the value of n (parallel scraping).

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver

browser = webdriver.Chrome()

class GameData:

    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

def parse_data(url):
    while True:
        try:
            browser.get(url)
            df = pd.read_html(browser.page_source)[0]
            break
        except KeyError:
            # table not loaded yet - retry with the same browser
            # (quitting the browser here would break the next attempt)
            continue
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data

# URLs go here
urls = {
    "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
    "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
    "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
    "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
    "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
    "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
    "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
    "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
    "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
    "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
}

if __name__ == '__main__':

    results = None

    for url in urls:
        game_data = parse_data(url)
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            # DataFrame.append() was removed in pandas 2.0, so use pd.concat()
            results = pd.concat([results, result], ignore_index=True)

    print(results)

Currently the script uses one browser window for all the urls.

How can I modify the code so that it opens multiple browser instances, one for every n urls, finishes the same job faster, and appends everything to results?

Tags: python, beautifulsoup, multiprocessing, selenium-chromedriver

Solution


Using DevTools in Chrome/Firefox (tab: Network, filters: JS, XHR) I found the urls which the page uses to get data from the server with AJAX:

https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/xbNfvuAM/X0/1/0/1/
https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/l8FEjeUE/X0/1/0/1/

etc.

The urls are similar; the only difference is the code xbNfvuAM vs l8FEjeUE. I found PageTournament({"id":"l8FEjeUE", ... in the page source, so I can generate these urls.
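
For example, the id can be pulled out with a regular expression; this is only a minimal sketch (the full code below does the same with str.find):

# sketch: extract the tournament id and build the AJAX url with a regex
# (the full solution below does the same with `str.find`)
import re
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'referer': 'https://www.oddsportal.com/',
}

url = 'https://www.oddsportal.com/soccer/romania/superliga-women/results/#/'
text = requests.get(url, headers=headers).text

match = re.search(r'PageTournament\(\{"id":"(\w+)"', text)
if match:
    code = match.group(1)
    print(f'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/{code}/X0/1/0/1/')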

This way I can get the HTML without Selenium, using only requests.

The original code needs ~20s, and the version with requests needs only ~6s.

BTW: I also reduced parse_data to use only a DataFrame, without the GameData class.

import requests
import json
import pandas as pd
from bs4 import BeautifulSoup as bs
import time
from multiprocessing import Pool

# --- functions ---

def get_html(url):
    r = requests.get(url, headers=headers)
    text = r.text
    start = text.find('PageTournament({"id":"') + len('PageTournament({"id":"')
    end = text.find('"', start)
    code = text[start:end]
    print(f'code: {code}')

    url = f'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/{code}/X0/1/0/1/'

    r = requests.get(url, headers=headers)
    text = r.text

    # remove `globals.jsonpCallback('...',` at the start
    text = text.split(',', 1)[1]
    text = text[:-2]              # remove `);` at the end

    # print as a single f-string so output from other processes doesn't get interleaved
    print(f'json: {text[:25]} ... {text[-25:]}')

    data = json.loads(text)
    html = data['d']['html']

    # again a single f-string, so output from other processes doesn't get interleaved
    print(f'html: {html[:25]} ... {html[-25:]}')

    return html


def parse_data(html):
    try:
        df = pd.read_html(html)[0]
    except KeyError:
        print('KeyError')
        return

    soup = bs(html, "lxml")
    header = soup.select('table th.first2.tl a')

    if not header:
        return

    df['country'] = header[1].text
    df['league'] = header[2].text

    return df


def process(url):
    return parse_data(get_html(url))

# --- main ---

# needed headers - on some systems it has to be outside `__main__`

headers = {
    'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'referer': 'https://www.oddsportal.com/',
}

if __name__ == '__main__':

    # urls for AJAX requests
    # ajax_urls = {
    #    # for 'view-source:https://www.oddsportal.com/soccer/romania/superliga-women/results/#/'
    #    'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/xbNfvuAM/X0/1/0/1/',
    #    # for 'https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/'
    #    'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/l8FEjeUE/X0/1/0/1/',
    # }
    # you can find `l8FEjeUE` in the original page as `PageTournament({"id":"l8FEjeUE", ...`

    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
        "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
        "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
        "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
    }

    time_start = time.time()

    # empty `DataFrame` so I don't have to check `if results is None`
    results = pd.DataFrame()

    with Pool(10) as p:
        all_game_data = p.map(process, urls)

    for game_data in all_game_data:

        if game_data is None:
            #print('game_data', game_data)
            continue

        # DataFrame.append() was removed in pandas 2.0, so use pd.concat()
        results = pd.concat([results, game_data], ignore_index=True)

    time_end = time.time()
    time_diff = (time_end - time_start)

    print(f'time: {time_diff:.2f} s')

    print('--- results ---')
    print(results)
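
Since this workload is network-bound, a thread pool would probably do the same job without the process startup cost. This is only my variation (a sketch, not part of the measured code), reusing the process function, headers and urls defined above:

# sketch: threads instead of processes (assumption, reusing `process`
# and `urls` from the code above). `requests` releases the GIL during
# network I/O, so threads parallelize this workload too, and `headers`
# no longer has to live outside `__main__`.
from multiprocessing.pool import ThreadPool

if __name__ == '__main__':
    with ThreadPool(10) as p:
        all_game_data = p.map(process, urls)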

EDIT:

As @αԋɱҽԃαмєяιcαη found out, headers has to be outside __main__ because on some systems it may otherwise raise an error.
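
The reason: on systems that use the 'spawn' start method (Windows, macOS), worker processes re-import the module, so only module-level names exist in them. A minimal illustration (my example, assuming the 'spawn' start method):

# minimal illustration (assumption: 'spawn' start method, e.g. Windows/macOS):
# child processes re-import this module, so they see module-level names,
# but not names created inside `__main__`
from multiprocessing import Pool

headers = {'user-agent': 'test'}       # module level: visible in every worker

def worker(url):
    return url, headers['user-agent']  # works: `headers` exists in the child

if __name__ == '__main__':
    parent_only = 'never re-created in workers'  # children don't see this
    with Pool(2) as p:
        print(p.map(worker, ['a', 'b']))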


Documentation: multiprocessing


EDIT:

I also created code that uses multiprocessing to run the original Selenium version.

The problem is that the browser object can't be sent to the processes, so every process has to run its own Selenium; it displays 5 browsers at the same time, and starting all those browsers takes extra time. For me it needs ~40s.

Maybe if the processes were run with queues, to receive urls and send back the HTML, they could reuse the browsers (one browser, or a few browsers, running all the time), but that needs more complex code. A rough sketch of that idea is below.
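
This is only a sketch of the queue idea (my assumption of how it could look, not tested against the site): each worker starts one browser and reuses it for every url it takes from the queue. The Pool-based version that I actually measured follows after it.

# sketch of the queue idea (assumption, not tested against the site):
# each worker starts ONE browser and reuses it for many urls
from multiprocessing import Process, Queue

from selenium import webdriver

def worker(url_queue, html_queue):
    browser = webdriver.Chrome()            # one browser per worker, reused
    while True:
        url = url_queue.get()
        if url is None:                     # sentinel: no more work
            break
        browser.get(url)
        html_queue.put((url, browser.page_source))
    browser.quit()

if __name__ == '__main__':
    urls = [
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        # ... rest of the urls from above
    ]

    n = 2                                   # number of browsers/workers
    url_queue = Queue()
    html_queue = Queue()

    workers = [Process(target=worker, args=(url_queue, html_queue)) for _ in range(n)]
    for w in workers:
        w.start()

    for url in urls:
        url_queue.put(url)
    for _ in workers:
        url_queue.put(None)                 # one sentinel per worker

    # collect one result per url (blocks until workers produce them)
    pages = [html_queue.get() for _ in urls]

    for w in workers:
        w.join()

    for url, html in pages:
        print(url, len(html))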

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
from multiprocessing import Pool

# --- classes ---

class GameData:

    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

# --- functions ---

def parse_data(url):
    browser = webdriver.Chrome()
    
    while True:
        try:
            browser.get(url)
            df = pd.read_html(browser.page_source)[0]
            break
        except KeyError:
            print('KeyError:', url)
            continue
            
    html = browser.page_source
    browser.quit()            

    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data

# --- main ---

if __name__ == '__main__':

    # URLs go here
    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
        "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
        "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
        "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
    }


    time_start = time.time()
    
    results = None
    
    with Pool(5) as p:
        all_game_data = p.map(parse_data, urls)
        
    for game_data in all_game_data:
            
        if game_data is None:
            #print('game_data', game_data)
            continue
        
        result = pd.DataFrame(game_data.__dict__)
        
        if results is None:
            results = result
        else:
            # DataFrame.append() was removed in pandas 2.0, so use pd.concat()
            results = pd.concat([results, result], ignore_index=True)

    time_end = time.time()
    time_diff = (time_end - time_start)
    print(f'time: {time_diff:.2f} s')
    
    print('--- results ---')
    print(results)    
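
Side note: part of that ~40s comes from rendering five visible windows. Chrome can be started headless so the parallel browsers don't open windows at all; a minimal sketch (assuming a reasonably recent Chrome/Selenium; older versions use '--headless' instead of '--headless=new'):

# sketch: headless Chrome, so the parallel browsers don't open windows
# (assumption: recent Chrome/Selenium; older versions use '--headless')
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def make_browser():
    options = Options()
    options.add_argument('--headless=new')
    return webdriver.Chrome(options=options)

parse_data() would then call make_browser() instead of webdriver.Chrome().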
