python - if 或 try 循环在页面 selenium 中的元素
问题描述
我正在尝试在这里抓取代理数据。我能够从第一页获取链接。我正在使用编号循环,因为我知道总页数。只要“下一页”选项存在,我就尝试运行它。我尝试了“尝试”和“如果不是”,但无法弄清楚。欢迎任何帮助。这是代码。
from selenium import webdriver
import time
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)
links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")
def first_links():
initial_data = driver.find_elements_by_tag_name('td')
for initial in initial_data:
page_links = initial.find_elements_by_tag_name('a')
for page in page_links:
page_link = page.get_attribute("href")
links_total.append(page_link)
driver.refresh()
if driver.find_element_by_partial_link_text('next'):
next_page = driver.find_element_by_partial_link_text('next')
next_page.click()
time.sleep(2)
new_data = driver.find_elements_by_tag_name('td')
for new in new_data:
links = new.find_elements_by_tag_name('a')
for link in links:
new_link = link.get_attribute("href")
links_total.append(new_link)
for i in range(1, 23):
first_links()
for link in links_total:
print(link)
解决方案
Try-catch 会是更好的选择
from selenium import webdriver
import time
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)
driver.implicitly_wait(10)
# links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")
def first_links(links_total=[]):
initial_data = driver.find_elements_by_tag_name('td')
for initial in initial_data:
page_links = initial.find_elements_by_tag_name('a')
for page in page_links:
page_link = page.get_attribute("href")
links_total.append(page_link)
# driver.refresh()
try:
next_page = driver.find_element_by_partial_link_text('next')
next_page.click()
time.sleep(2)
first_links(links_total)
except (TimeoutError, ElementNotVisibleException, NoSuchElementException):
print("NEXT btn not found : ")
pass
return links_total
all_links = first_links()
for link in all_links:
print(link)
您实际上不需要使用 Selenium。你可以像这样用 BeautifulSoap 做到这一点:
import requests
from bs4 import BeautifulSoup
page_num=0
url_cbp = r"https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=&page={}"
def get_links(links_total=[], page_num=0):
page = requests.get(url_cbp.format(page_num))
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='region-content')
table_cells = results.find_all('td', class_='views-field')
for cell in table_cells:
# print(cell )
# print('\n\n')
cell_link = cell.find('a')
page_link = cell_link["href"]
links_total.append(page_link)
next_page = results.find('li', class_='pager-next')
if next_page:
page_num += 1
get_links(links_total, page_num)
return links_total
all_links = get_links()
for link in all_links:
print(link)
推荐阅读
- android-music-player - 音乐播放器应用程序无法在 android 10 上运行
- docker - 如何在 docker-compose.yml 文件中使用的 github 操作中读取环境变量?
- javascript - Javascript (Vanilla) element.checked = true/false 不起作用
- c++ - 何时使用#define 与 typedef?
- javascript - 字符串在MongoDB中进行时间转换
- spring - Spring Cloud 数据流 - 任务 - 如何传递命令行参数
- vba - 如何在 Excel Office 365 for Mac 中执行 HTTP 获取/发布请求?
- javascript - 如何从绑定函数中提取 this 属性?
- python - 如何让这个 Python 代码只输出一行?
- javascript - 添加取消按钮以取消打开窗口