python - 从维基百科获取和解析数据
问题描述
我正在尝试从维基百科获取并解析一些数据用于分析,目的是研究表格各列中的因素与幸福指数得分之间的相关性。
但是这段脚本没有按预期工作。事实上,最终生成的 CSV 文件是空的。我尝试过调试,但没有成功:
import requests
import bs4
from bs4 import BeautifulSoup
import csv
from csv import DictWriter
def get_page(url, timeout=30):
    """Download *url* and return the ``requests`` response object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` for the page; callers read its ``.text``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    # Without a timeout, requests may wait indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of handing its HTML to the parser.
    response.raise_for_status()
    return response
def _to_number(text):
    """Return *text* converted to ``float``, or unchanged if not numeric."""
    try:
        # float() rejects thousands separators such as "1,234".
        return float(text.replace(',', ''))
    except ValueError:
        return text


def parse_html(html_file, table_index=1):
    """Extract one HTML table from a fetched page as a list of row dicts.

    Args:
        html_file: Response-like object whose ``.text`` holds the page HTML.
        table_index: Position of the wanted ``<table>`` among all tables on
            the page. NOTE(review): the default of 1 is the value the
            original code hard-coded; verify it is the intended table.

    Returns:
        A ``(headers, countries)`` tuple. ``headers`` is the list of column
        names taken from the table's first row; ``countries`` is a list of
        dicts mapping each header to that row's value. The
        'Country or region' column stays a string; every other column is
        converted to ``float`` when possible and left as text otherwise.

    Raises:
        IndexError: If the page has no table at ``table_index``.
    """
    text_columns = ('Country or region',)

    parsed_html = bs4.BeautifulSoup(html_file.text, 'html.parser')
    tables = parsed_html.find_all('table')
    try:
        table = tables[table_index]
    except IndexError:
        raise IndexError(
            f'page has {len(tables)} table(s); no table at index {table_index}'
        ) from None

    rows = table.find_all('tr')
    if not rows:
        return [], []

    # Take the full text of every header cell so that names wrapped in
    # links or spans are kept and there is exactly one header per column.
    headers = [
        cell.get_text(' ', strip=True)
        for cell in rows[0].find_all(['th', 'td'], recursive=False)
    ]

    countries = []
    # NOTE(review): the first two rows are skipped exactly as the original
    # did; confirm the target table really has two non-data rows on top.
    for row in rows[2:]:
        cells = row.find_all(['th', 'td'], recursive=False)
        # Skip header-only rows and rows too short to line up with headers.
        if row.find('td') is None or len(cells) < len(headers):
            continue
        country = {}
        for header, cell in zip(headers, cells):
            value = cell.get_text(' ', strip=True)
            if header not in text_columns:
                value = _to_number(value)
            country[header] = value
        countries.append(country)
    return headers, countries
def write_csv(filename, data, fieldnames):
    """Write a list of row dicts to *filename* as CSV with a header row.

    Args:
        filename: Path of the CSV file to create or overwrite.
        data: Iterable of dicts, one per row, keyed by column name.
        fieldnames: Column names, in the order they should be written.

    Raises:
        ValueError: If a row contains a key that is not in ``fieldnames``
            (the ``csv.DictWriter`` default).
    """
    # newline='' lets the csv module control line endings; without it,
    # Windows output gets an extra blank line after every row. UTF-8 keeps
    # non-ASCII country names writable regardless of the platform locale.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
def main():
    """Fetch the World Happiness Report page and dump its table to CSV.

    Downloads the Wikipedia article, parses it with ``parse_html`` and
    writes the result to three CSV files, then prints the number of rows
    and the number of columns that were parsed.
    """
    source = "https://en.wikipedia.org/wiki/World_Happiness_Report"

    print(f'Fetching URL {source}...')
    page = get_page(source)

    print(f'Parsing {source}...')
    headers, countries = parse_html(page)
    print('Writing to a CSV file 1...')
    write_csv('worldhappiness.csv', countries, headers)

    # NOTE(review): this second parse runs on the same page with the same
    # arguments, so files 2 and 3 receive the same rows as file 1 despite
    # the "table 3" message -- confirm what was intended here.
    print(f'Parsing table 3 (simpler table) {source} ...')
    headers, countries = parse_html(page)
    later_outputs = ((2, 'worldhappiness2.csv'), (3, 'worldhappiness3.csv'))
    for number, filename in later_outputs:
        print(f'Writing to a CSV file {number}...')
        write_csv(filename, countries, headers)

    # Row count first, then column count.
    for collection in (countries, headers):
        print(len(collection))


if __name__ == "__main__":
    main()
解决方案
使用 pandas 库来完成这个任务要简单得多:
import pandas as pd

WIKI_URL = 'https://en.wikipedia.org/wiki/World_Happiness_Report'

# read_html returns a list with one DataFrame per table it finds on the page.
tables = pd.read_html(WIKI_URL)
# Index 4 is the table the answer identifies as the target; in an
# interactive session the bare expression below displays it.
happiness_table = tables[4]
happiness_table
输出是您的目标表。
推荐阅读
- javascript - 从悬停触发的弹出框快速滚动不会隐藏弹出框
- node.js - 将更新的heroku应用程序从本地项目推送到heroku master,redis
- node.js - VSCode Nodejs 调试器不保存更改?
- azure-cosmosdb - 简单 CosmosDb 查询高 RU
- java - 为什么 Mockito @Mock 创建一个非模拟实例?
- r - 基于单独列的唯一值的行索引
- google-bigquery - BigQuery 检查是否存在嵌套字段
- python - 如何在 Django 中将 allauth 用户引用为外键
- regex - 正则表达式识别类别页面但排除产品
- php - 仅在 php 中获取所需的数组