首页 > 解决方案 > 在 Python 中从网络抓取中添加数据


我有这个用于从 understat.com 获取 xG 值的 python 脚本(特别感谢@chitown88)。

我想在比赛中至少收到一张红牌的球队名称后添加一个星号 (*)。例如,在 https://understat.com/match/9458 中,哈德斯菲尔德收到一张红牌,因此输出中能否在其名称旁边加上一个 *,即 哈德斯菲尔德*。



import requests
import json
import re
from pandas.io.json import json_normalize
import pandas as pd

# Download the match page. A timeout keeps the script from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors directly
# instead of letting them show up later as a failed regex match.
response = requests.get('https://understat.com/match/9458', timeout=30)
response.raise_for_status()


def _extract_embedded_json(variable_name, html):
    """Return the object assigned to *variable_name* via JSON.parse('...') in *html*.

    The page text is searched for ``<variable_name> = JSON.parse('<payload>``;
    the payload's backslash escapes are decoded and the result is parsed as
    JSON.

    Raises:
        ValueError: if no such assignment is found in *html*.
    """
    # Raw string: "\s" and "\(" are regex escapes, not Python string escapes.
    found = re.search(re.escape(variable_name) + r"\s+=\s+JSON.parse\('([^']+)", html)
    if found is None:
        raise ValueError(
            "could not find %s = JSON.parse('...') in the page" % variable_name
        )
    decoded_string = bytes(found.group(1), 'utf-8').decode('unicode_escape')
    return json.loads(decoded_string)


shotsObj = _extract_embedded_json('shotsData', response.text)
matchObj = _extract_embedded_json('match_info', response.text)
rostersObj = _extract_embedded_json('rostersData', response.text)

# Shots Data into a DataFrame
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the frames the same way and works on every version.
away_shots_df = json_normalize(shotsObj['a'])
home_shots_df = json_normalize(shotsObj['h'])
shots_df = pd.concat([away_shots_df, home_shots_df])

# Rosters Data into a DataFrame
# Each value of the per-side roster mapping becomes one row. Building the
# frame in one call avoids re-copying it for every entry; the only visible
# difference is that row labels run 0..n-1 instead of all being 0.
away_rosters_df = pd.DataFrame(list(rostersObj['a'].values()))
home_rosters_df = pd.DataFrame(list(rostersObj['h'].values()))

rosters_df = pd.concat([away_rosters_df, home_rosters_df])

# Maps the 'a'/'h' side codes used in the shots data to team names
teams_dict = {'a': matchObj['team_a'], 'h': matchObj['team_h']}
match_title = matchObj['team_h'] + ' vs. ' + matchObj['team_a']

#print (shots_df)

# Running total of xG per team, minute by minute, built from the shot list
import numpy as np

# Cast once up front so minutes sort numerically and xG values can be summed
shots_df = shots_df.astype({'minute': int, 'xG': float})

# Keep only the columns needed, order by minute, and swap the side code
# in 'h_a' for the team name
timing_chart_df = (
    shots_df.loc[:, ['h_a', 'minute', 'xG']]
    .sort_values(by='minute')
    .assign(h_a=lambda frame: frame['h_a'].map(teams_dict))
)

# Latest minute in which a shot was recorded
max_value = timing_chart_df['minute'].max()

# Several shots by one team in the same minute collapse into a single row
timing_chart_df = timing_chart_df.groupby(
    ['h_a', 'minute'], as_index=False
)['xG'].sum()

# Full (team, minute) grid covering minute 0 through the last shot minute
min_idx = np.arange(timing_chart_df['minute'].max() + 1)
m_idx = pd.MultiIndex.from_product(
    [timing_chart_df['h_a'].unique(), min_idx],
    names=['h_a', 'minute'],
)

# Minutes without a shot get xG 0, then accumulate within each team
timing_chart_df = (
    timing_chart_df.set_index(['h_a', 'minute'])
    .reindex(m_idx, fill_value=0)
    .reset_index()
)
timing_chart_df['running_sum_xG'] = timing_chart_df.groupby('h_a')['xG'].cumsum()

# One row per team, one column per minute; the team-name column is retitled
# with the match title
timing_chart_T_df = (
    timing_chart_df
    .pivot(index='h_a', columns='minute', values='running_sum_xG')
    .reset_index()
    .rename(columns={'h_a': match_title})
)

print(timing_chart_T_df.to_string())

标签: python, web-scraping




# Suffix for each side: '*' when the 'red_card' values of its roster sum to
# more than zero, otherwise an empty string. The two assignments per side
# must be alternatives; without the else branch the '*' is immediately
# overwritten and the name is left undefined when there is no red card.
if away_rosters_df['red_card'].astype(int).sum() > 0:
    a_red_card = '*'
else:
    a_red_card = ''

if home_rosters_df['red_card'].astype(int).sum() > 0:
    h_red_card = '*'
else:
    h_red_card = ''


# Team names keyed by side code, with the red-card marker appended
teams_dict = {'a':matchObj['team_a']+a_red_card, 'h':matchObj['team_h']+h_red_card}


import requests
import json
import re
from pandas.io.json import json_normalize
import pandas as pd

# Download the match page. A timeout keeps the script from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors directly
# instead of letting them show up later as a failed regex match.
response = requests.get('https://understat.com/match/9458', timeout=30)
response.raise_for_status()


def _extract_embedded_json(variable_name, html):
    """Return the object assigned to *variable_name* via JSON.parse('...') in *html*.

    The page text is searched for ``<variable_name> = JSON.parse('<payload>``;
    the payload's backslash escapes are decoded and the result is parsed as
    JSON.

    Raises:
        ValueError: if no such assignment is found in *html*.
    """
    # Raw string: "\s" and "\(" are regex escapes, not Python string escapes.
    found = re.search(re.escape(variable_name) + r"\s+=\s+JSON.parse\('([^']+)", html)
    if found is None:
        raise ValueError(
            "could not find %s = JSON.parse('...') in the page" % variable_name
        )
    decoded_string = bytes(found.group(1), 'utf-8').decode('unicode_escape')
    return json.loads(decoded_string)


shotsObj = _extract_embedded_json('shotsData', response.text)
matchObj = _extract_embedded_json('match_info', response.text)
rostersObj = _extract_embedded_json('rostersData', response.text)

# Shots Data into a DataFrame
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the frames the same way and works on every version.
away_shots_df = json_normalize(shotsObj['a'])
home_shots_df = json_normalize(shotsObj['h'])
shots_df = pd.concat([away_shots_df, home_shots_df])

# Rosters Data into a DataFrame
# Each value of the per-side roster mapping becomes one row. Building the
# frame in one call avoids re-copying it for every entry; the only visible
# difference is that row labels run 0..n-1 instead of all being 0.
away_rosters_df = pd.DataFrame(list(rostersObj['a'].values()))
home_rosters_df = pd.DataFrame(list(rostersObj['h'].values()))

rosters_df = pd.concat([away_rosters_df, home_rosters_df])

# Suffix for each side: '*' when the 'red_card' values of its roster sum to
# more than zero, otherwise an empty string. A conditional expression
# guarantees the name is always bound and the '*' is never overwritten.
a_red_card = '*' if away_rosters_df['red_card'].astype(int).sum() > 0 else ''
h_red_card = '*' if home_rosters_df['red_card'].astype(int).sum() > 0 else ''

# Team names keyed by side code, with the red-card marker appended
teams_dict = {'a':matchObj['team_a']+a_red_card, 'h':matchObj['team_h']+h_red_card}
match_title = matchObj['team_h'] + ' vs. ' + matchObj['team_a']

# The timing chart is a per-team running sum of xG taken from the shots data
import numpy as np

# Cast once up front so minutes sort numerically and xG values can be summed
shots_df = shots_df.astype({'minute': int, 'xG': float})

# Keep only the columns needed, order by minute, and swap the side code
# in 'h_a' for the team name
timing_chart_df = (
    shots_df.loc[:, ['h_a', 'minute', 'xG']]
    .sort_values(by='minute')
    .assign(h_a=lambda frame: frame['h_a'].map(teams_dict))
)

# Latest minute in which a shot was recorded
max_value = timing_chart_df['minute'].max()

# Several shots by one team in the same minute collapse into a single row
timing_chart_df = timing_chart_df.groupby(
    ['h_a', 'minute'], as_index=False
)['xG'].sum()

# Full (team, minute) grid covering minute 0 through the last shot minute
min_idx = np.arange(timing_chart_df['minute'].max() + 1)
m_idx = pd.MultiIndex.from_product(
    [timing_chart_df['h_a'].unique(), min_idx],
    names=['h_a', 'minute'],
)

# Minutes without a shot get xG 0, then accumulate within each team
timing_chart_df = (
    timing_chart_df.set_index(['h_a', 'minute'])
    .reindex(m_idx, fill_value=0)
    .reset_index()
)
timing_chart_df['running_sum_xG'] = timing_chart_df.groupby('h_a')['xG'].cumsum()

# One row per team, one column per minute; the team-name column is retitled
# with the match title
timing_chart_T_df = (
    timing_chart_df
    .pivot(index='h_a', columns='minute', values='running_sum_xG')
    .reset_index()
    .rename(columns={'h_a': match_title})
)

from datetime import datetime

# Team names carry the same red-card marker as the 'h_a' values in
# timing_chart_df, so they can be used to select each team's rows below.
home_team = matchObj['team_h']+h_red_card
away_team = matchObj['team_a']+a_red_card

league = matchObj['league']
season = matchObj['season']

# Reformat the kickoff timestamp, e.g. 'Saturday, February 23, 2019'
datetime_object = datetime.strptime(matchObj['date'], '%Y-%m-%d %H:%M:%S')
date = datetime_object.strftime('%A, %B %d, %Y')

# One-row frame per team: columns are minutes, values the running xG sum
home_xg_sum = timing_chart_df[timing_chart_df['h_a'] == home_team].pivot(index='h_a', columns='minute', values='running_sum_xG')
away_xg_sum = timing_chart_df[timing_chart_df['h_a'] == away_team].pivot(index='h_a', columns='minute', values='running_sum_xG')

# Single wide row: match details, then the home minutes, then the away minutes.
# The minute labels therefore appear twice among the column names.
# NOTE(review): a team with no rows in timing_chart_df yields an empty pivot,
# and the [0] lookup below would raise IndexError.
data = [league, season, date, home_team, away_team] + home_xg_sum.values.tolist()[0] + away_xg_sum.values.tolist()[0]
cols = ['League','Season','Date','Home team','Away team'] + list(home_xg_sum.columns) + list(away_xg_sum.columns)

results_df = pd.DataFrame([data], columns = cols)


  League Season                         Date         Home team      Away team    0    1    2    3    4    5    6         7         8         9        10        11        12        13        14       15       16       17        18        19        20        21        22        23        24        25        26        27        28        29        30        31        32        33        34        35        36        37        38        39        40        41        42       43       44       45       46        47        48        49        50        51        52        53        54        55        56        57        58        59        60        61        62        63        64        65        66        67        68        69        70        71        72        73        74        75        76        77        78        79        80        81        82        83        84        85    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30        31        32        33        34        35        36        37        38        39        40        41        42        43        44        45        46        47        48        49        50        51        52        53        54        55        56        57        58        59        60        61        62        63        64        65        66        67        68        69        70        71        72        73        74        75        76        77        78        79        80        81        82        83        84        85
0    EPL   2018  Saturday, February 23, 2019  Newcastle United  Huddersfield*  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.087855  0.087855  0.087855  0.087855  0.474551  0.474551  0.474551  0.474551  0.52089  0.52089  0.52089  0.588242  0.588242  0.588242  0.588242  0.588242  0.588242  0.588242  0.650563  0.650563  0.650563  0.713521  0.765269  0.765269  0.765269  0.765269  0.765269  0.765269  0.765269  0.765269  0.765269  0.780235  0.862191  0.862191  0.862191  0.972581  1.00803  1.00803  2.01324  2.01324  2.103931  2.103931  2.103931  2.103931  2.248354  2.248354  2.248354  2.278213  2.278213  2.278213  2.278213  2.278213  2.278213  2.397133  2.397133  2.397133  2.397133  2.397133  2.397133  2.484387  2.484387  2.624275  2.624275  2.755339  2.868987  2.868987  2.868987  2.868987  3.011753  3.011753  3.011753  3.011753  3.011753  3.011753  3.011753  3.011753  3.026651  3.026651  3.026651  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.110397  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.120421  0.133949
