python - 提取游戏数据:不正确的值
问题描述
我对 BeautifulSoup 很陌生
我正在努力从类似的代码中获取游戏数据,但我不理解语法和放置。
此代码适用于一页:
browser = webdriver.Chrome()
class GameData:
def __init__(self):
self.date = []
self.time = []
self.game = []
self.score = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
self.country = []
self.league = []
def parse_data(url):
browser.get(url)
df = pd.read_html(browser.page_source, header=0)[0]
html = browser.page_source
soup = bs(html, "lxml")
cont = soup.find('div', {'id': 'wrap'})
content = cont.find('div', {'id': 'col-content'})
content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
main = content.find('th', {'class': 'first2 tl'})
if main is None:
return None
count = main.findAll('a')
country = count[1].text
league = count[2].text
game_data = GameData()
game_date = None
for row in df.itertuples():
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
game_date = row[1].split('-')[0]
continue
game_data.date.append(game_date)
game_data.time.append(row[1])
game_data.game.append(row[2])
game_data.score.append(row[3])
game_data.home_odds.append(row[4])
game_data.draw_odds.append(row[5])
game_data.away_odds.append(row[6])
game_data.country.append(country)
game_data.league.append(league)
return game_data
urls = {
"https://www.oddsportal.com/soccer/australia/a-league/results/#/page/1/",
}
if __name__ == '__main__':
results = None
for url in urls:
try:
game_data = parse_data(url)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
except ValueError:
game_data = parse_data(url)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
except AttributeError:
game_data = parse_data(url)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
print(tabulate(results.head(), headers='keys', tablefmt="github"))
结果:
| | date | time | game | score | home_odds | draw_odds | away_odds | country | league |
|----|-------------------|--------|------------------------------------------|---------|-------------|-------------|-------------|-----------|----------|
| 0 | Yesterday, 03 Apr | 09:05 | Brisbane Roar - WS Wanderers | 1:1 | 2.38 | 3.65 | 2.79 | Australia | A-League |
| 1 | 02 Apr 2021 | 08:05 | Macarthur FC - Perth Glory | 2:0 | 2.01 | 3.66 | 3.57 | Australia | A-League |
| 2 | 01 Apr 2021 | 09:40 | Central Coast Mariners - Adelaide United | 2:1 | 2.1 | 3.8 | 3.17 | Australia | A-League |
| 3 | 01 Apr 2021 | 07:35 | Western United - Melbourne City | 2:1 | 5.95 | 4.61 | 1.49 | Australia | A-League |
| 4 | 28 Mar 2021 | 08:10 | Adelaide United - Sydney FC | 1:0 | 2.97 | 3.72 | 2.24 | Australia | A-League |
我修改了另一个页面的代码:
def parse_data(url):
browser.get(url)
df = pd.read_html(browser.page_source, header=0)[0]
html = browser.page_source
soup = bs(html, "lxml")
cont = soup.find('div', {'id': 'wrap'})
content = cont.find('div', {'id': 'col-content'})
content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
main = content.find('th', {'class': 'first2 tl'})
if main is None:
return None
count = main.findAll('a')
# The competition and the country was earlier for the same for the entire page. Now its different for every match. Hos can I capture that data?
country = count[0].text
league = count[1].text
game_data = GameData()
game_date = None
for row in df.itertuples():
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
game_date = row[1].split('-')[0]
continue
game_data.date.append(game_date)
game_data.time.append(row[1])
game_data.game.append(row[2])
game_data.score.append(row[3])
game_data.home_odds.append(row[4])
game_data.draw_odds.append(row[5])
game_data.away_odds.append(row[6])
game_data.country.append(country)
game_data.league.append(league)
return game_data
urls = {
"https://www.oddsportal.com/matches/soccer/20210405/",
}
……
我得到了同一个国家和所有人的竞争,而它却不一样。
| | date | time | game | score | home_odds | draw_odds | away_odds | country | league |
|----|-----------------------------|--------|--------------------------------|--------------------------------|-------------|-------------|-------------|-----------|-----------------------------|
| 0 | | 00:00 | Racing Club - Godoy Cruz | Racing Club - Godoy Cruz | 1.74 | 3.63 | 4.56 | Argentina | Copa de la Liga Profesional |
| 1 | Costa Rica»Primera Division | 00:00 | Sporting San Jose - San Carlos | Sporting San Jose - San Carlos | 2.65 | 2.85 | 2.81 | Argentina | Copa de la Liga Profesional |
| 2 | Guatemala»Liga Nacional | 00:00 | Comunicaciones - Iztapa | Comunicaciones - Iztapa | 1.49 | 3.83 | 5.83 | Argentina | Copa de la Liga Profesional |
| 3 | Mexico»Liga MX | 00:00 | Queretaro - U.A.N.L.- Tigres | Queretaro - U.A.N.L.- Tigres | 3.34 | 3.29 | 2.18 | Argentina | Copa de la Liga Profesional |
| 4 | Colombia»Primera A | 01:00 | Junior - Aguilas | Junior - Aguilas | 1.66 | 3.3 | 5.99 | Argentina | Copa de la Liga Profesional
|
如何编辑代码?
我了解页面的 Xpath 与工作的不同,即
//*[@id="table-matches"]/table/tbody/tr[2]/td[2]/a[2]
解决方案
推荐阅读
- airflow-scheduler - Timedeltasensor 从计划间隔延迟
- java - 我有一个用于 kafka 连接的 kafka 管道(json 问题更新)
- java - 无法使用 Morphia 2.0.2 检索现有文档
- c# - Microsoft Graph API 错误:MailboxNotEnabledForRestAPI
- javascript - Javascript / 验证输入
- sql-server - 使用 ORDER BY 的查询如何比没有排序的相同查询运行得更快?
- spring-boot - Git commit 可以立即运行,但在 intellij 中 gradle clean build 需要将近 5 分钟。重新同步在几秒钟内发生
- npm - npm 安装错误“无法读取未定义的属性‘最新’”
- r - 将每个变量存储到列表元素中
- jupyter-notebook - 无法在受限数据处理设置中安装 JUPYTER