python - 点击后等待数据表加载/ Selenium
问题描述
我正在尝试使用 selenium/python 从印度中央污染控制委员会读取数据表。这是一个输出示例。我基本上遵循此处介绍的方法:https://github.com/RachitKamdar/Python-Scraper 。
感谢 @Prophet,我能够从第一页读取数据(参见"使用 XPATH 和 Python 选择元素?")。但是当切换到第 2 页时,我无法让 selenium 等待数据表重新加载。我尝试添加 WebDriverWait 指令,但这似乎并不起作用。任何帮助将不胜感激。谢谢
这是我试图做的
# First attempt: set the page length to 100, read the highest page number from the
# pagination links, then visit every page by clicking its number and parse the table.
# NOTE(review): indentation was lost in this paste; the lines from the click down to
# `i = i + 1` are presumably the body of the while loop.
browser.find_element_by_tag_name("select").send_keys("100")
# Wait until at least one page-number link is visible before reading the page count.
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
# The text of the last page-number link is taken as the number of pages.
maxpage = int(browser.find_elements(By.XPATH,"//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
i = 1
while i < maxpage + 1:
# Click the pagination link whose text contains the current page number.
# NOTE(review): contains(text(), ...) is a substring match, so '1' would also match
# links such as '10' or '11'; find_element returns the first match.
browser.find_element(By.XPATH,"//*[@id='DataTables_Table_0_paginate']/span/a[contains(text(),'{}')]".format(i)).click()
# NOTE(review): this only waits for the wrapper element to be visible; presumably it is
# already visible before the click, so it does not wait for the new rows.
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID,"DataTables_Table_0_wrapper")))
#this works ok for page 1
#this does not wait after the click for the data table to update. As a result res is wrong for page 2 [empty].
res = browser.page_source
soup = BeautifulSoup(res, 'html.parser')
# Narrow the parsed document down to the data table element.
soup = soup.find(id = 'DataTables_Table_0')
...
i = i + 1
更新 1:根据 Prophet 的建议,我做了以下修改:
# Second attempt: parse the current page first, then advance with the "next" button.
# NOTE(review): indentation was lost in this paste; everything after the `while` line is
# presumably the loop body, with the if/else choosing between first page and later pages.
browser.find_element_by_tag_name("select").send_keys("100")
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID,"DataTables_Table_0_wrapper")))
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
# The text of the last page-number link is taken as the number of pages.
maxpage = int(browser.find_elements(By.XPATH,"//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
print(maxpage)
i = 1
while i < maxpage + 1:
# NOTE(review): same wrapper-visibility wait as before the loop; presumably it returns
# immediately and does not wait for the table contents to change.
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID,"DataTables_Table_0_wrapper")))
res = browser.page_source
soup = BeautifulSoup(res, 'html.parser')
soup = soup.find(id = 'DataTables_Table_0')
# The first page initialises `data`; later pages are appended to it.
if i == 1:
data = getValsHtml(soup)
else:
data = data.append(getValsHtml(soup))
print(i)
print(data)
i = i + 1
# Advance to the next page.
# NOTE(review): this click also runs after the last page has been parsed.
browser.find_element(By.XPATH,'//a[@class="paginate_button next"]').click()
这仍然在第 2 页崩溃(数据为空)。此外,数据应包含第 1 页的 100 个项目,但仅包含 10 个。最大页数是正确的 (15)。
更新 2:
这是合并 Prophet 建议后的整个脚本 [原始脚本遵循 https://github.com/RachitKamdar/Python-Scraper]。这仅从第一页检索 10 个点,并且无法切换到下一页。
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
def getValsHtml(table):
    """Convert a parsed HTML table into a pandas DataFrame.

    Args:
        table: an object exposing ``find_all(tag)`` (e.g. the BeautifulSoup
            element for the ``<table>``), whose ``th``/``td`` results carry a
            ``text`` attribute.

    Returns:
        A DataFrame whose columns are the stripped texts of all ``<th>`` cells
        and whose rows are the stripped ``<td>`` texts of every ``<tr>`` that
        contains at least one ``<td>``.

    Raises:
        ValueError: propagated from pandas when a data row's width does not
            match the number of header cells.
    """
    header = [cell.text.strip() for cell in table.find_all('th')]
    body = []
    for row in table.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        # Header rows contain only <th> cells and therefore yield no <td>s.
        # Skipping them here (instead of blindly popping the first row) keeps
        # the function safe for tables with no rows at all and for tables
        # whose first <tr> already holds data.
        if cells:
            body.append(cells)
    return pd.DataFrame(body, columns=header)
def parameters(br,param):
    """Select one entry in the filterable parameter list.

    Types ``param`` into the ``<input>`` found inside the element with class
    ``list-filter``, clicks the second element with class ``pure-checkbox``,
    and finally clears that input again.

    Args:
        br: the Selenium driver (or element) used for the lookups.
        param: text typed into the filter input.
    """
    filter_box = br.find_element_by_class_name("list-filter")
    filter_box.find_element_by_tag_name("input").send_keys(param)
    checkboxes = br.find_elements_by_class_name("pure-checkbox")
    checkboxes[1].click()
    # Look the input up again rather than reusing the earlier reference, as
    # the original did after the click.
    refreshed_box = br.find_element_by_class_name("list-filter")
    refreshed_box.find_element_by_tag_name("input").clear()
# Main script: open the dashboard, fill in the query form, submit it, then try to
# collect every page of the result table into `data`.
# NOTE(review): indentation was lost in this paste; loop and try/except bodies below
# are presumably the lines that directly follow their headers.

# Maximum number of seconds passed to every WebDriverWait below.
timeout = 60
url = 'https://app.cpcbccr.com/ccr/#/caaqm-dashboard-all/caaqm-landing/data'
chdriverpath="/net/f1p/my_soft/chromedriver"
option = webdriver.ChromeOptions()
browser = webdriver.Chrome(executable_path="{}".format(chdriverpath), chrome_options=option)
browser.get(url)
# Query inputs typed into the form below.
station="Secretariat, Amaravati - APPCB"
state="Andhra Pradesh"
city="Amaravati"
# sd / ed are used below as [day, month abbreviation, year]; presumably start and end date.
sd=['01', 'Jan', '2018']
ed=['31', 'Dec', '2021']
duration="24 Hours"
# Wait for the first "toggle" element, then work through the toggles by index:
# 0 receives the state, 1 the city, 2 the station and 4 the duration.
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME,"toggle")))
browser.find_elements_by_class_name("toggle")[0].click()
browser.find_element_by_tag_name("input").send_keys(state)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[1].click()
browser.find_element_by_tag_name("input").send_keys(city)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[2].click()
browser.find_element_by_tag_name("input").send_keys(station)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[4].click()
browser.find_element_by_class_name("filter").find_element_by_tag_name("input").send_keys(duration)
browser.find_element_by_class_name("options").click()
# Click the "c-btn" element; the parameters() calls below use the list-filter input
# (presumably this opens the parameter list).
browser.find_element_by_class_name("c-btn").click()
# Select each requested parameter; on any failure print "miss" and clear the filter input.
# NOTE(review): the bare `except:` swallows every exception, including KeyboardInterrupt.
for p in ['NH3']:
print(p)
try:
parameters(browser,p)
except:
print("miss")
browser.find_element_by_class_name("list-filter").find_element_by_tag_name("input").clear()
pass
# First date picker: choose month (by upper-cased abbreviation id), year (by id), then day.
browser.find_element_by_class_name("wc-date-container").click()
browser.find_element_by_class_name("month-year").click()
browser.find_element_by_id("{}".format(sd[1].upper())).click()
browser.find_element_by_class_name("year-dropdown").click()
browser.find_element_by_id("{}".format(int(sd[2]))).click()
browser.find_element_by_xpath('//span[text()="{}"]'.format(int(sd[0]))).click()
# Second date picker: same steps, using the second match of each element.
browser.find_elements_by_class_name("wc-date-container")[1].click()
browser.find_elements_by_class_name("month-year")[1].click()
browser.find_elements_by_id("{}".format(ed[1].upper()))[1].click()
browser.find_elements_by_class_name("year-dropdown")[1].click()
browser.find_element_by_id("{}".format(int(ed[2]))).click()
browser.find_elements_by_xpath('//span[text()="{}"]'.format(int(ed[0])))[1].click()
# Click the last <button> on the page (presumably the submit button).
browser.find_elements_by_tag_name("button")[-1].click()
next_page_btn_xpath = '//a[@class="paginate_button next"]'
actions = ActionChains(browser)
#This is how you should treat the Select drop down
select = Select(browser.find_element_by_tag_name("select"))
select.select_by_value('100')
# NOTE(review): these waits only check visibility of the wrapper and of a pagination
# link; presumably both can already be visible before the table is redrawn with 100 rows.
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH,'//div[@class="dataTables_wrapper no-footer"]')))
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
# The text of the last page-number link is taken as the number of pages.
maxpage = int(browser.find_elements(By.XPATH,"//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
i = 1
while i < maxpage + 1:
# Parse the currently displayed page of the table.
res = browser.page_source
soup = BeautifulSoup(res, 'html.parser')
soup = soup.find(id = 'DataTables_Table_0')
# The first page initialises `data`; later pages are appended to it.
if i == 1:
data = getValsHtml(soup)
else:
data = data.append(getValsHtml(soup))
print(i)
print(data)
i = i + 1
#scroll to the next page btn and then click it
next_page_btn = browser.find_element_by_xpath(next_page_btn_xpath)
actions.move_to_element(next_page_btn).perform()
# NOTE(review): `next_page_btn` is the element found two lines above, not the XPath
# string `next_page_btn_xpath`, yet it is passed here as the By.XPATH locator value.
browser.find_element(By.XPATH,next_page_btn).click()
browser.quit()
解决方案
代替
browser.find_element(By.XPATH,"//*[@id='DataTables_Table_0_paginate']/span/a[contains(text(),'{}')]".format(i)).click()
尝试单击此元素:
browser.find_element(By.XPATH,'//a[@class="paginate_button next"]').click()
它就是"下一页"按钮,无论您当前位于哪一页,它的定位方式都不会改变。
此外,而不是
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID,"DataTables_Table_0_wrapper")))
尝试这个
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH,'//div[@class="dataTables_wrapper no-footer"]')))
此元素对于所有页面都是相同的,而您尝试使用的只是为第一页定义的。
UPD
正确的代码应该是这样的:
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
# Collect every page of the DataTables result table into one DataFrame.
# Uses `browser`, `timeout` and getValsHtml() from the script in the question.
next_page_btn_xpath = '//a[@class="paginate_button next"]'
# Rows of the result table that actually hold data cells.
data_rows_xpath = "//*[@id='DataTables_Table_0']//tr[td]"
# Assumes DataTables' default markup, where the active page link carries the
# class "current" -- TODO confirm against the live page.
current_page_xpath = "//*[@id='DataTables_Table_0_paginate']//a[contains(@class,'current')]"
actions = ActionChains(browser)
#This is how you should treat the Select drop down
select = Select(browser.find_element(By.TAG_NAME, "select"))
rows_before = len(browser.find_elements(By.XPATH, data_rows_xpath))
select.select_by_value('100')
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH,'//div[@class="dataTables_wrapper no-footer"]')))
# The wrapper is visible before and after the page-length change, so also wait
# until the number of data rows changes; otherwise the old 10-row page is parsed.
try:
    WebDriverWait(browser, timeout).until(
        lambda drv: len(drv.find_elements(By.XPATH, data_rows_xpath)) != rows_before
    )
except TimeoutException:
    # The result set may simply be no larger than the previous page length.
    print("row count did not change after selecting 100 rows per page")
maxpage = int(browser.find_elements(By.XPATH,"//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
pages = []
i = 1
while i < maxpage + 1:
    # Wait until the pagination control reports page i as the active page, so
    # the table is not parsed before the click has taken effect.
    WebDriverWait(browser, timeout).until(
        EC.text_to_be_present_in_element((By.XPATH, current_page_xpath), str(i))
    )
    res = browser.page_source
    soup = BeautifulSoup(res, 'html.parser')
    soup = soup.find(id = 'DataTables_Table_0')
    pages.append(getValsHtml(soup))
    print(i)
    # There is no further page to move to after the last one.
    if i < maxpage:
        #scroll to the next page btn and then click it
        next_page_btn = browser.find_element(By.XPATH, next_page_btn_xpath)
        actions.move_to_element(next_page_btn).perform()
        # Click the element that was just located; find_element() expects an
        # XPath string, not a WebElement.
        next_page_btn.click()
    i = i + 1
# DataFrame.append was removed in pandas 2.0; concatenate all pages at once.
# Row labels restart on every page, exactly as they did with append().
data = pd.concat(pages) if pages else None
print(data)
推荐阅读
- terraform-provider-aws - 为什么已经创建的实例被删除并重新创建?
- jquery - 如何正确切换兄弟元素和相关子元素?
- javascript - 在 reactjs 中使用 forEach 读取数组元素
- mongodb - 如何使用 mongodb 在 lambda 函数中组合两个输出?
- python - 我无法在 django 中成功迁移和运行我的服务器
- r - 对我的数据进行子集化并没有产生预期的结果
- bash - 如何在我的 bash 脚本函数中使用我自己的选项
- java - LibGDX / Box2D:制作 KinematicBody 移动平台
- javascript - NoSuchElementError:没有这样的元素:无法找到元素:
- python - 绘图(plot)背景颜色