首页 > 解决方案 > 使用 Python Selenium 从页面中递归地抓取表格

问题描述

我正在尝试从 http://hdr.undp.org/en/data 抓取一张表格。

在运行通过单击“下一页”按钮来翻页的代码时,我不断收到 TimeoutException 错误。代码如下:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd

# Script from the question: page through the table at
# http://hdr.undp.org/en/data, collect seven columns from every row, and
# write the result to hdi_ind.csv.
# NOTE(review): the path below is a non-raw string containing backslashes;
# a raw string (r"...") would rule out accidental escape sequences.
driver=webdriver.Chrome(executable_path ="C:\Program Files\Google\Chrome\chromedriver.exe")
driver.get('http://hdr.undp.org/en/data')

# Paging state: current page number and the hard-coded number of pages.
page=1
max_page=19
# One accumulator list per output column.
Rank=[]
Country=[]
HDI_Val=[]
Life_ex=[]
Ex_schl=[]
Mean_schl=[]
GNI_pc=[]
while page<=max_page:
    
    # Wait up to 10 seconds for every row matched by the XPath to be visible.
    # NOTE(review): the XPath looks for a <div> directly under <table id='chart'>,
    # and the lookup is made in the top-level document; the answer further down
    # reads the rows from an iframe's source instead, so this wait is presumably
    # where the reported timeout comes from — confirm against the live page.
    rows= WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//table[@id='chart']/div[2]/table/tbody//tr")))
    
    # Cells 2..8 of each row map to the seven columns; the first cell is not read.
    for row in rows:
        Rank.append(row.find_element_by_xpath('./td[2]').text)
        Country.append(row.find_element_by_xpath('./td[3]').text)
        HDI_Val.append(row.find_element_by_xpath('./td[4]').text)
        # Read via the textContent attribute rather than .text
        # (presumably because this cell is not visibly rendered — confirm).
        Life_ex.append(row.find_element_by_xpath('./td[5]').get_attribute('textContent'))
        Ex_schl.append(row.find_element_by_xpath('./td[6]').text)
        # Same textContent-based read as for Life_ex above.
        Mean_schl.append(row.find_element_by_xpath('./td[7]').get_attribute('textContent'))
        GNI_pc.append(row.find_element_by_xpath('./td[8]').text)


    # Wait up to 20 seconds for the "next" button to become clickable, then click it.
    # NOTE(review): this also runs after the rows of the final page were read, and
    # the class value 'next svelte-1yl7n8i' is matched exactly — the 'svelte-…'
    # part looks auto-generated, so the selector may be brittle; verify.
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[@class='next svelte-1yl7n8i']"))).click()
    page=page+1
    print('navigate to page: ' + str(page))

# Close the browser window once all pages have been visited.
driver.close()

# Assemble the collected columns into a DataFrame, print it, and save it as CSV
# without the index column.
df=pd.DataFrame({"Rank":Rank,"Country":Country,"HDI_Val":HDI_Val,"Life_ex":Life_ex,"Ex_schl":Ex_schl,"Mean_schl":Mean_schl,"GNI_pc":GNI_pc})
print(df)
df.to_csv('hdi_ind.csv',index=False)

标签: python selenium selenium-webdriver

解决方案


表格位于一个 iframe 中。与其处理与该 iframe 的交互,不如直接导航到它的源地址(src):

# Load the iframe's source URL directly instead of switching into the iframe,
# then page through the table and collect seven columns per row.
# Relies on `driver` (a Selenium WebDriver) and `pd` (pandas) already being set up.
driver.implicitly_wait(5)
driver.get("http://hdr.undp.org/en/data")

# Pick the iframe by its title; fail with a clear message if it is missing
# rather than with a bare IndexError.
iframe = next(
    (
        ele
        for ele in driver.find_elements_by_tag_name("iframe")
        if ele.get_attribute("title") == "Human Development Index (HDI) Ranking"
    ),
    None,
)
if iframe is None:
    raise RuntimeError(
        'iframe titled "Human Development Index (HDI) Ranking" not found'
    )

driver.get(iframe.get_attribute("src"))

max_page = 19
columns = ["Rank", "Country", "HDI_Val", "Life_ex", "Ex_schl", "Mean_schl", "GNI_pc"]
# One accumulator list per output column, in the same order as `columns`.
t = [[] for _ in columns]
for page in range(1, max_page + 1):
    for row in driver.find_elements_by_class_name("css-kfswhc"):
        # Skip the first cell; the following cells fill the columns in order.
        cells = [cell.text for cell in row.find_elements_by_tag_name("td")[1:]]
        for i, mylist in enumerate(t):
            mylist.append(cells[i])
    # Only advance while another page remains; clicking "next" after the last
    # page is useless and can fail if the button is no longer available.
    if page < max_page:
        driver.find_element_by_class_name("next").click()

df = pd.DataFrame(dict(zip(columns, t)))
print(df)

推荐阅读