Selenium loop: appending multiple tables together

Problem description

I'm a new Python user here. I've been writing a script that uses Selenium and Beautiful Soup to go to a website, grab an HTML table, and convert it into a dataframe.

I'm using Selenium to loop through many different pages and Beautiful Soup to collect the table from each one.

The problem I'm running into is that I can't get all of these tables appended to one another. If I print out the dataframe, it only shows the last table that was scraped. How do I append one dataframe to the bottom of another?

Any help would be appreciated; I've been stuck on this one small part for a few days.
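For reference, the usual pandas idiom here is to build one dataframe per page, collect them in a list, and call pd.concat once after the loop. A minimal sketch, where the hypothetical scrape_table() stands in for the Selenium/BeautifulSoup work:

import pandas as pd

# Hypothetical stand-in for the per-state scraping step; in the real
# script this would return the parsed table for one state.
def scrape_table(state):
    return pd.DataFrame({'COUNTY': [f'{state} County'], 'PAYMENT': ['$1']})

frames = []
for state in ['Alabama', 'Alaska']:
    frames.append(scrape_table(state))   # one dataframe per page

# Concatenate once, after the loop, instead of overwriting df each pass.
combined = pd.concat(frames, ignore_index=True)
print(combined)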

states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia",
"Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", 
"Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
"New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", 
"Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", 
"Washington", "West Virginia", "Wisconsin", "Wyoming"]

period = "2020"

num_states = len(states)

state_list = []

for state in states:
    driver = webdriver.Chrome(executable_path = 'C:/webdrivers/chromedriver.exe')
    driver.get('https://www.nbc.gov/pilt/counties.cfm')
    driver.implicitly_wait(20)
    state_s = driver.find_element(By.NAME, 'state_code')
    drp = Select(state_s)
    drp.select_by_visible_text(state)
    year_s = driver.find_element(By.NAME, 'fiscal_yr')
    drp = Select(year_s)
    drp.select_by_visible_text(period)
    driver.implicitly_wait(10)
    link = driver.find_element(By.NAME, 'Search')
    link.click()
    url = driver.current_url
    page = requests.get(url)
    #dfs  = pd.read_html(addrss)[2]
    # Get the html
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.findAll('table')[2]
    headers = []

    for i in table.find_all('th'):
        title = i.text.strip()
        headers.append(title)

    df = pd.DataFrame(columns = headers)

    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df.loc[length] = row_data
    df = df.rename(columns={'Total Acres':'Total_acres'})
    for i in range(s,num_states):
        state_list.append([County[i].text, Payment[i].text, Total_acres[i].text])

print(df)

******************** EDIT *************************

period = "2020"

num_states = len(states)

state_list = []

df = pd.DataFrame()

for state in states:
    driver = webdriver.Chrome(executable_path = 'C:/webdrivers/chromedriver.exe')
    driver.get('https://www.nbc.gov/pilt/counties.cfm')
    driver.implicitly_wait(20)
    state_s = driver.find_element(By.NAME, 'state_code')
    drp = Select(state_s)
    drp.select_by_visible_text(state)
    year_s = driver.find_element(By.NAME, 'fiscal_yr')
    drp = Select(year_s)
    drp.select_by_visible_text(period)
    driver.implicitly_wait(10)
    link = driver.find_element(By.NAME, 'Search')
    link.click()
    url = driver.current_url
    page = requests.get(url)
    #dfs  = pd.read_html(addrss)[2]
    # Get the html
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.findAll('table')[2]
    headers = []

    for i in table.find_all('th'):
        title = i.text.strip()
        headers.append(title)

    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df.loc[length] = row_data

dfs = pd.concat([df for state in states])

print(df)

Result: ValueError: cannot set a frame with no defined columns
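The ValueError happens because df = pd.DataFrame() is created with no columns, and pandas cannot align a list of values against an empty column set when assigning df.loc[length] = row_data. A minimal reproduction, with the fix of defining the columns first:

import pandas as pd

df = pd.DataFrame()                      # no columns defined
# df.loc[0] = ['a', 'b']                 # raises ValueError: cannot set a frame with no defined columns

df = pd.DataFrame(columns=['COUNTY', 'PAYMENT'])
df.loc[0] = ['a', 'b']                   # works: the row aligns with the defined columns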

Tags: pandas, selenium, beautifulsoup, append, concatenation

Solution


Access the table through pandas! Please refer to the comments on the lines that were added.

states = ["Alabama", "Alaska"]

period = "2020"

num_states = len(states)

state_list = []
driver = webdriver.Chrome()
result=[] # change 1 , list to store the {state:df}
for state in states:
    
    driver.get('https://www.nbc.gov/pilt/counties.cfm')
    driver.implicitly_wait(20)
    state_s = driver.find_element(By.NAME, 'state_code')
    drp = Select(state_s)
    drp.select_by_visible_text(state)
    year_s = driver.find_element(By.NAME, 'fiscal_yr')
    drp = Select(year_s)
    drp.select_by_visible_text(period)
    driver.implicitly_wait(10)
    link = driver.find_element(By.NAME, 'Search')
    link.click()
    url = driver.current_url
    page = requests.get(url)
    temp_res={}
    soup = BeautifulSoup(driver.page_source, 'lxml')
    df_list=pd.read_html(soup.prettify(), thousands=',') # access the table through pandas
    try:
        df_list[2].drop('PAYMENT.1', axis=1, inplace=True) # some states include this extra column, so drop it
    except KeyError:
        print(f"state: {state} does not have a PAYMENT.1 column")
    try:
        df_list[2].drop('PAYMENT.2', axis=1, inplace=True) # some states include this extra column, so drop it
    except KeyError:
        print(f"state: {state} does not have a PAYMENT.2 column")
    temp_res[state]=df_list[2] # the table at index 2
    result.append(temp_res)

The output looks like this:

for each_run in result :
    for each_state in each_run:
        print(each_run[each_state].head(1))
           COUNTY PAYMENT TOTAL ACRES
0  AUTAUGA COUNTY  $4,971       1,758
                   COUNTY   PAYMENT TOTAL ACRES
0  ALEUTIANS EAST BOROUGH  $668,816   2,663,160
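
Since the original goal was one stacked table, the per-state frames collected in result can still be combined with pd.concat. A small sketch, assuming the result list built above, with a STATE column added so the rows stay identifiable:

import pandas as pd

frames = []
for each_run in result:
    for state_name, state_df in each_run.items():
        frames.append(state_df.assign(STATE=state_name))  # tag each row with its state

combined = pd.concat(frames, ignore_index=True)           # one dataframe, all states stacked
print(combined.head())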
