pandas - Selenium Loop 将多个表附加在一起
问题描述
我是 Python 新手。我一直在写一段代码,它使用 Selenium 和 BeautifulSoup 访问一个网站,获取其中的 HTML 表格并将其转换为数据框(DataFrame)。
我使用 Selenium 循环浏览许多不同的页面,并用 BeautifulSoup 从这些页面中收集表格。
我遇到的问题是,我无法把所有这些表依次拼接在一起。如果我打印数据框,它只会打印最后一个被抓取的表。如何告诉 BeautifulSoup 将一个数据框追加到另一个数据框的底部?
任何帮助都将不胜感激,我已经在这一小部分上卡了好几天了。
# Question code: for every state, drive the PILT county search form with
# Selenium and try to collect the result table into a pandas DataFrame.
# US state names plus the District of Columbia; each one is used below as the
# visible text of an option in the 'state_code' drop-down.
states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia",
"Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
"Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
"New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
"Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia",
"Washington", "West Virginia", "Wisconsin", "Wyoming"]
# Value selected in the 'fiscal_yr' drop-down.
period = "2020"
num_states = len(states)
state_list = []
# NOTE(review): the indentation of this listing was lost; the statements below
# presumably form the body of this loop -- confirm against the original post.
for state in states:
# A Chrome session is started here; the visible code never closes it.
driver = webdriver.Chrome(executable_path = 'C:/webdrivers/chromedriver.exe')
driver.get('https://www.nbc.gov/pilt/counties.cfm')
driver.implicitly_wait(20)
# Pick the state in the search form.
state_s = driver.find_element(By.NAME, 'state_code')
drp = Select(state_s)
drp.select_by_visible_text(state)
# Pick the fiscal year in the search form.
year_s = driver.find_element(By.NAME, 'fiscal_yr')
drp = Select(year_s)
drp.select_by_visible_text(period)
driver.implicitly_wait(10)
# Submit the search.
link = driver.find_element(By.NAME, 'Search')
link.click()
# The browser's current URL is downloaded again with requests, and that
# response (not the page held by the browser) is what gets parsed below.
url = driver.current_url
page = requests.get(url)
#dfs = pd.read_html(addrss)[2]
# Get the html
soup = BeautifulSoup(page.text, 'lxml')
# Work on the third <table> of the page (index 2).
table = soup.findAll('table')[2]
# Column names are taken from the table's <th> cells.
headers = []
for i in table.find_all('th'):
title = i.text.strip()
headers.append(title)
# NOTE(review): df is bound to a brand-new DataFrame here, so whatever df held
# before is discarded; this matches the reported symptom that only the last
# scraped table is printed.
df = pd.DataFrame(columns = headers)
# Add one DataFrame row per table row, skipping the first <tr>.
for row in table.find_all('tr')[1:]:
data = row.find_all('td')
row_data = [td.text.strip() for td in data]
length = len(df)
df.loc[length] = row_data
# NOTE(review): rename is called on the DataFrame class with no DataFrame
# passed in; presumably df.rename(...) was intended -- confirm.
df = pd.DataFrame.rename(columns={'Total Acres':'Total_acres'})
# NOTE(review): s, County, Payment and Total_acres are not defined anywhere in
# the visible code.
for i in range(s,num_states):
state_list.append([County[i].text, Payment[i].text, Total_acres[i].text])
print(df)
******************** 编辑 *************************
period = "2020"
num_states = len(states)
state_list = []
df = pd.DataFrame()
for state in states:
driver = webdriver.Chrome(executable_path = 'C:/webdrivers/chromedriver.exe')
driver.get('https://www.nbc.gov/pilt/counties.cfm')
driver.implicitly_wait(20)
state_s = driver.find_element(By.NAME, 'state_code')
drp = Select(state_s)
drp.select_by_visible_text(state)
year_s = driver.find_element(By.NAME, 'fiscal_yr')
drp = Select(year_s)
drp.select_by_visible_text(period)
driver.implicitly_wait(10)
link = driver.find_element(By.NAME, 'Search')
link.click()
url = driver.current_url
page = requests.get(url)
#dfs = pd.read_html(addrss)[2]
# Get the html
soup = BeautifulSoup(page.text, 'lxml')
table = soup.findAll('table')[2]
headers = []
for i in table.find_all('th'):
title = i.text.strip()
headers.append(title)
for row in table.find_all('tr')[1:]:
data = row.find_all('td')
row_data = [td.text.strip() for td in data]
length = len(df)
df.loc[length] = row_data
dfs = pd.concat([df for state in states])
print(df)
结果:ValueError: cannot set a frame with no defined columns(无法向没有定义列的 DataFrame 写入行)
解决方案
通过 pandas 直接读取表格!请参考代码中的注释。
# Scrape the PILT county table for each state with one shared browser session
# and collect the tables both per state (result) and as one combined DataFrame
# (all_states_df).
from io import StringIO

states = ["Alabama", "Alaska"]
period = "2020"

# Position of the county table among the tables pandas finds on the page.
COUNTY_TABLE_INDEX = 2

driver = webdriver.Chrome()
# The implicit wait is a session-wide setting, so it is configured once here
# instead of being re-set on every iteration.
driver.implicitly_wait(20)

result = []  # one {state: DataFrame} dict per scraped state
try:
    for state in states:
        driver.get('https://www.nbc.gov/pilt/counties.cfm')

        # Fill in the search form (state, fiscal year) and submit it.
        Select(driver.find_element(By.NAME, 'state_code')).select_by_visible_text(state)
        Select(driver.find_element(By.NAME, 'fiscal_yr')).select_by_visible_text(period)
        driver.find_element(By.NAME, 'Search').click()

        # Parse the page the browser is showing; no second download with
        # requests is needed.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # Let pandas extract the tables. The markup is wrapped in StringIO
        # because passing a literal HTML string to read_html is deprecated.
        # NOTE(review): thousands=',,' is kept from the original; presumably
        # it keeps values such as "1,758" as text -- confirm.
        df_list = pd.read_html(StringIO(soup.prettify()), thousands=',,')
        table = df_list[COUNTY_TABLE_INDEX]

        # Some states come back with duplicated PAYMENT columns; drop them so
        # every table ends up with the same columns.
        for n in (1, 2):
            extra_column = f"PAYMENT.{n}"
            if extra_column in table.columns:
                table = table.drop(columns=extra_column)
            else:
                print(f"state: {state} does not have payment {n}")

        result.append({state: table})
finally:
    # Always close the browser, even if a lookup or the parsing fails.
    driver.quit()

# Stack all per-state tables into a single DataFrame, tagging each row with
# the state it came from.
frames = [
    table.assign(STATE=state)
    for each_run in result
    for state, table in each_run.items()
]
all_states_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
输出如下所示:
# Print the first row of every scraped table as a quick sanity check.
for state_tables in result:
    for table in state_tables.values():
        print(table.head(1))
COUNTY PAYMENT TOTAL ACRES
0 AUTAUGA COUNTY $4,971 1,758
COUNTY PAYMENT TOTAL ACRES
0 ALEUTIANS EAST BOROUGH $668,816 2,663,160
推荐阅读
- node.js - How can I delete not verified emails in Firebase in web?
- javascript - 将布尔值从 vue 路由器传递到其组件
- react-native - realm react-native - 2 different databases/schemas
- aws-security-group - 如何限制 AWS 安全组出站规则
- javascript - Electron:在 Windows 上禁用粘贴
- php - 获取异常
- java - 代码运行,但它说的是错误的答案,无法继续使用我不明白的消息
- sql - 如何使用 SqlQuerySpec 查询 CosmosDb 的多个结果
- java - Java映射问题
- node.js - 节点 - 鸟 API