python - 如何让 read_html 循环?
问题描述
我目前有以下代码,其中 df2 = df2[0] 把处理限制为只收集相应日期第一场比赛的数据。我想弄清楚如何收集同一天进行的多场比赛的数据。
想法是提取一天发生的所有比赛的比赛数据并继续运行整个页面
例如,tables[20] 返回两个 html 链接:
1) href="/matches/2338970/north-vs-mad-lions-dreamhack-open-leipzig-2020
2) href="/matches/2338968/heroic-vs-mad-lions-dreamhack-open-leipzig-2020
我尝试了以下方法:
for i in range(len(df2)):
df2[i] = df2[i]
但它不会更新每个变量(teamchosen、choose、maps),而只是在其他匹配项中重复第一个匹配项数据(见图)
team_id = 8362
for i in range(0,1):
i = i*100
url = "https://www.hltv.org/results?offset={}&team={}".format(i,team_id)
res = requests.get(url)
soup = bs(res.content, 'lxml')
tables = soup.find_all("div", {"class": "results-sublist"})
index = 0
list_dfs = []
for table in tables:
df = pd.DataFrame(columns=["match", "teamchoose", "chosen", "maps", "team", "opponent", "date"])
df2 = pd.read_html(str(table))
df2 = df2[0]
link = table.find('a', href=True)
link = "https://www.hltv.org/" + link.get('href')
res = requests.get(link)
soup = bs(res.content, 'lxml')
temp = soup.find_all("div", {"class": "padding"})
date = date = pd.to_datetime(int(soup.select(".timeAndEvent div")[0]['data-unix'])*1000000)
out = re.findall(r'<div>\d\.(.*?)</div>', str(temp))
dict_choices = {"teamchoose": [], "chosen": [], "maps": []}
for choice in out[0:6]:
split = choice.strip(" ").split(" ")
dict_choices["teamchoose"].append(" ".join(split[:-2]))
dict_choices["chosen"].append(split[-2])
dict_choices["maps"].append(split[-1])
# df = df.append(dict_choices, True)
# dict_choices = {"turn": [], "choice": [], "maps": []}
try:
left = out[6]
split = left.strip(" ").split(" ")
dict_choices["teamchoose"].append(split[2])
dict_choices["chosen"].append(split[2])
dict_choices["maps"].append(split[0])
except:
pass
df = df.append(pd.DataFrame.from_dict(dict_choices, orient='index').transpose())
df["opponent"] = df2[2].iloc[0]
df["team"] = df2[0].iloc[0]
df["match"] = index
df['date'] = date
list_dfs.append(df)
index +=1
df_out = pd.concat(list_dfs)
df_out = df_out[['match','date','team','opponent','teamchoose','chosen','maps']]
df_out.to_csv("{}_vetoes.csv".format(team_name),index=False)
print(tabulate(df_out, headers='keys', tablefmt='psql'))
解决方案
好的,你只需要再加一层循环,遍历 read_html 从每个表中提取出来的所有子表。此外我还做了另一项更改:不必先设置 index = 0 再在每次循环末尾手动递增——使用 enumerate() 就能自动完成这件事。看看下面的代码是否有效:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import re
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
team_id = 8362
for i in range(0,1):
i = i*100
url = "https://www.hltv.org/results?offset={}&team={}".format(i,team_id)
res = requests.get(url, headers=headers)
soup = bs(res.content, 'html.parser')
tables = soup.find_all("div", {"class": "results-sublist"})
list_dfs = []
for index, table in enumerate(tables):
print ('Page %s:\t%s of %s' %(i+1, index+1, len(tables)))
dfs = pd.read_html(str(table)) #<--- returns all the tables into a list called dfs
for tableIdx, df2 in enumerate(dfs): #<---- add additional loop here
df = pd.DataFrame(columns=["match", "teamchoose", "chosen", "maps", "team", "opponent", "date"])
link = table.find_all('a', href=True)[tableIdx] #<--- Also need to grab correct link for associated table/match if there are more than 1 match
link = "https://www.hltv.org/" + link.get('href')
res = requests.get(link, headers=headers)
soup = bs(res.content, 'lxml')
temp = soup.find_all("div", {"class": "padding"})
date = pd.to_datetime(int(soup.select(".timeAndEvent div")[0]['data-unix'])*1000000)
out = re.findall(r'<div>\d\.(.*?)</div>', str(temp))
dict_choices = {"teamchoose": [], "chosen": [], "maps": []}
for choice in out[0:6]:
split = choice.strip(" ").split(" ")
dict_choices["teamchoose"].append(" ".join(split[:-2]))
dict_choices["chosen"].append(split[-2])
dict_choices["maps"].append(split[-1])
# df = df.append(dict_choices, True)
# dict_choices = {"turn": [], "choice": [], "maps": []}
try:
left = out[6]
split = left.strip(" ").split(" ")
dict_choices["teamchoose"].append(split[2])
dict_choices["chosen"].append(split[2])
dict_choices["maps"].append(split[0])
except:
pass
df = df.append(pd.DataFrame.from_dict(dict_choices, orient='index').transpose())
df["opponent"] = df2[2].iloc[0]
df["team"] = df2[0].iloc[0]
df["match"] = index
df['date'] = date
list_dfs.append(df)
df_out = pd.concat(list_dfs)
df_out = df_out[['match','date','team','opponent','teamchoose','chosen','maps']]
df_out.to_csv("{}_vetoes.csv".format(team_name),index=False)
print(tabulate(df_out, headers='keys', tablefmt='psql'))
推荐阅读
- sublimetext3 - 崇高的文字:主页键不移动到行首
- rest - 视频中的音频识别
- sql-server - 有没有办法将调用包的 SQL 代理作业步骤链接回其在 SSISDB 中的包记录?
- html - 展开按钮以使用 colspan 和 rowspan 覆盖整个单元格
- php - 如何在 symfony 的表单组件的输入字段中限制数字(1-10)?
- google-cloud-platform - GCP服务帐户JWT的多个范围?
- angular - 在分发文件夹中看不到 Angular 环境文件
- javascript - v-bind.sync 不将对象作为道具传递
- c - 如何使这个程序可以接受a字和h字但h字不可重复
- python - 无法使一个整数大于另一个整数