python - 将抓取的表存储为字典并输出为 pandas DataFrame
问题描述
我从下面给出的网站上抓取了一些数据,但无法把这些数据输出到 Excel。另外,我已经把抓取到的表格存储为字典,但其中的键和值对不上。请问有人能帮忙吗?
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
# Question code, kept as the asker wrote it (it still shows the reported
# key/value mismatch); only the indentation was restored and comments added.
# NOTE(review): the original indentation was lost; the nesting of the "td"
# loop inside the "th" loop below is a reconstruction -- confirm.

# Fetch the index page and collect the href of every link in the records table.
url = requests.get("http://stats.espncricinfo.com/ci/content/records/307847.html" )
soup = bs(url.text, 'lxml')
soup_1 = soup.find(class_ = "recordsTable")
soup_pages = soup_1.find_all('a', href= True)
state_links =[]
for link in soup_pages:
    state_links.append(link['href'])

# Visit each linked page and try to turn its table into a dict.
for i in state_links:
    parse_link = "http://stats.espncricinfo.com"+i
    url_new = requests.get(parse_link)
    soup_new = bs(url_new.text, 'lxml')
    soup_table = soup_new.find(class_="engineTable")
    results = {}  # never used afterwards
    newdict = dict()
    for col in soup_table.findAll('th'):
        colname = (col.text).lstrip().rstrip()  # computed but never used
        for row in soup_table.findAll("td"):
            rowname = row.text.lstrip().rstrip()  # computed but never used
            # Every cell is written to the same key, so each header ends up
            # holding only the text of the last <td> in the table.
            newdict[col.text] = row.text
    print (newdict)
解决方案
您在遍历单元格列表时,把每个值都存进了同一个字典键里,因此每次迭代都会覆盖上一次的结果,最后每个键只剩下最后一个单元格的值。试试下面的代码:它先分别收集列名和单元格文本,再按列数把单元格分组,每一行生成一个字典。
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
# Scrape the index page, follow every link found in its records table, and
# convert the table on each linked page into a list of {header: cell} dicts.
# The result, Total_dict, maps each link's text to that list of row dicts.

# Fetch the index page and locate the links inside the records table.
url = requests.get("http://stats.espncricinfo.com/ci/content/records/307847.html")
soup = bs(url.text, 'lxml')
soup_1 = soup.find(class_="recordsTable")
soup_pages = soup_1.find_all('a', href=True)

# Keep each link's target and its visible text side by side; the text is
# later used as the key in Total_dict.
state_links = []
state_id = []
for link in soup_pages:
    state_links.append(link['href'])
    state_id.append(link.getText())

Total_dict = dict()

for a, year in zip(state_links, state_id):
    parse_link = "http://stats.espncricinfo.com" + a
    url_new = requests.get(parse_link)
    soup_new = bs(url_new.text, 'lxml')
    soup_table = soup_new.find(class_="engineTable")
    if soup_table is None:
        # find() returns None when nothing matches; skip the page instead of
        # failing with an AttributeError on .findAll below.
        continue

    # Header texts and cell texts, both in document order.
    col_name = [col.text.strip() for col in soup_table.findAll('th')]
    row_name = [row.text.strip() for row in soup_table.findAll("td")]
    if not col_name:
        # Without headers the cells cannot be grouped into rows (and the
        # row-count division would be a division by zero).
        continue

    # Cut the flat cell list into consecutive groups of len(col_name) cells
    # and pair each group with the headers; a trailing incomplete group is
    # dropped, exactly as the floor of len(row_name) / len(col_name) would.
    width = len(col_name)
    newdictlist = [
        dict(zip(col_name, row_name[start:start + width]))
        for start in range(0, len(row_name) - width + 1, width)
    ]
    print(newdictlist)
    Total_dict[year] = newdictlist

print(Total_dict)
输出:{'1877': [{'Team 1': 'Australia', 'Team 2': 'England', 'Winner': 'Australia', 'Margin': '45 runs', 'Ground': 'Melbourne', 'Match Date': 'Mar 15-19, 1877', 'Scorecard': 'Test #1'}, {'Team 1': 'Australia', 'Team 2': 'England', 'Winner': 'England', 'Margin': '4 wickets', 'Ground': 'Melbourne', 'Match Date': 'Mar 31-Apr 4, 1877', 'Scorecard': 'Test #2'}], '1879': [{'Team 1': 'Australia', 'Team 2': 'England', 'Winner': 'Australia', 'Margin': '10 wickets', 'Ground': 'Melbourne', 'Match Date': 'Jan 2-4, 1879', 'Scorecard': 'Test #3'}], ............}