python - Python - Selenium 下一页
问题描述
我正在尝试制作一个抓取应用程序来抓取 Hants.gov.uk,现在我正在处理它,只需单击页面而不是抓取。当它到达第 1 页的最后一行时,它刚刚停止,所以我所做的是让它单击“下一页”按钮,但首先它必须返回到原始 URL。它点击第 2 页,但在第 2 页被抓取后它不会转到第 3 页,它只是重新启动第 2 页。
有人可以帮我解决这个问题吗?
代码:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
# Landing page listing recent planning decisions on the Hampshire portal.
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
# NOTE(review): hard-coded Windows chromedriver path — works only on this machine.
driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
# Dismiss the site's terms-acceptance button before scraping can begin.
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
def start():
    """Collect the search-result links on the current page and open each new one.

    Uses the module-level ``driver`` to find every ``.searchResult a`` anchor,
    then visits each previously-unseen href with Selenium and re-fetches it
    with urllib for BeautifulSoup parsing.

    Bug fixed: the original condition was inverted — it *appended* a link when
    it was not yet in ``result`` and only *visited* it in the ``else`` branch
    (i.e. when it was already a duplicate), so newly collected links were never
    opened.  Each unique link is now visited exactly once, when first seen.
    """
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result = []
    for link in links:
        if link in result:
            continue  # already handled on this pass — skip duplicates
        result.append(link)
        driver.get(link)
        # Fetch the same page again with urllib so BeautifulSoup can parse it.
        goUrl = urllib.request.urlopen(link)
        soup = BeautifulSoup(goUrl.read(), "html.parser")
        # Placeholder: the detail-extraction code was commented out upstream.
        for i in range(20):
            pass
# Scrape the first results page, return to the listing, then advance
# through the next five pages, scraping each one.
start()
driver.get(url)
pages_left = 5
while pages_left:
    # Advance the pager to the following results page.
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    # Capture the URL of the page the pager landed on, scrape it, then
    # navigate back to that URL before the next click.
    # NOTE(review): start() navigates away via driver.get(link), so the
    # pager state may not survive this round-trip — presumably the cause
    # of the "restarts page 2" symptom described above; verify.
    url = driver.current_url
    start()
    driver.get(url)
    pages_left -= 1
driver.close()
解决方案
试试这个:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
# Landing page listing recent planning decisions on the Hampshire portal.
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
# chromedriver is resolved from PATH here (no hard-coded executable path).
driver = webdriver.Chrome()
driver.get(url)
# Dismiss the site's terms-acceptance button before scraping can begin.
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
# Shared accumulator: start() collects hrefs here, start2() visits them later.
result = []
def start():
    """Append the href of every ``.searchResult a`` anchor on the current
    page to the module-level ``result`` list (duplicates are kept; the
    collected links are visited later by ``start2``)."""
    anchors = driver.find_elements_by_css_selector(".searchResult a")
    for anchor in anchors:
        result.append(anchor.get_attribute("href"))
def start2():
    """Visit every link gathered in the module-level ``result`` list.

    Each href is opened in the Selenium driver, then fetched a second time
    with urllib so BeautifulSoup can parse it.  The field-extraction logic
    was commented out in the original, so the parsed ``soup`` is currently
    unused and the trailing loop is a no-op placeholder.
    """
    for href in result:
        driver.get(href)
        response = urllib.request.urlopen(href)
        soup = BeautifulSoup(response.read(), "html.parser")
        # Placeholder kept from the original (extraction code was removed).
        for _ in range(20):
            pass
# Walk every results page: collect that page's links with start(), then
# click the pager's "next" control until it reports itself disabled
# (onclick == "return false;") or disappears; finally visit every
# collected link with start2().
#
# Fixes: (1) the element lookup used to sit OUTSIDE the try, so a page
# without the pager crashed with NoSuchElementException instead of ending
# the loop; (2) the bare ``except:`` also swallowed SystemExit and
# KeyboardInterrupt — narrowed to ``except Exception``.
while True:
    start()
    try:
        element = driver.find_element_by_class_name('rdpPageNext')
        if element.get_attribute('onclick') != "return false;":
            element.click()
        else:
            break  # "next" is disabled: this was the final page
    except Exception:
        break  # pager missing or stale: treat as end of results
print(result)
start2()
driver.get(url)
推荐阅读
- javascript - 使用jquery单击表格中的单元格文本
- html - 指定要在 CSS 类的背景属性中使用的图像
- javascript - 如何在 forEach 中嵌套 forEach 以获取 JavaScript 的总和
- python - AWS 抛出以下错误:“错误的解释器:没有这样的文件或目录”
- javascript - 如何在 ReactJS 的 onClick 事件按钮上添加单独的加载程序文件?
- swiftui - 如何清除视图框架
- python - 尝试使用 Seaborn 时出现 ImportError
- html - 当它的父div的宽度接收到全屏时,如何将文本换行?
- reactjs - React/Material-UI Slider/Leafet stopEventPropagation
- reactjs - 已获取的 json 响应的客户端存储