If or try loop for an element on the page in Selenium

Problem description

I am trying to scrape broker data from the page below. I can get the links from the first page. I am using a numbered loop because I know the total number of pages. I want it to run for as long as a "next" option exists on the page; I tried both try and if not, but could not figure it out. Any help is welcome. Here is the code.

from selenium import webdriver
import time

from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)


links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")
def first_links():
    initial_data = driver.find_elements_by_tag_name('td')
    for initial in initial_data:
        page_links = initial.find_elements_by_tag_name('a')
        for page in page_links:
            page_link = page.get_attribute("href")
            links_total.append(page_link)
    driver.refresh()
    if driver.find_element_by_partial_link_text('next'):
        next_page = driver.find_element_by_partial_link_text('next')
        next_page.click()
        time.sleep(2)
        new_data = driver.find_elements_by_tag_name('td')
        for new in new_data:
            links = new.find_elements_by_tag_name('a')
            for link in links:
                new_link = link.get_attribute("href")
                links_total.append(new_link)



for i in range(1, 23):
    first_links()


for link in links_total:
    print(link)

Tags: python, python-3.x, selenium, web-scraping

Solution


A try/except would be the better option. find_element_by_partial_link_text raises NoSuchElementException when nothing matches rather than returning a falsy value, so the if check in the question never reaches its false branch on the last page; catching the exception handles the end of pagination cleanly.

from selenium import webdriver
import time

from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)

driver.implicitly_wait(10)
# links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")

def first_links(links_total=[]):
    # Collect every link from the table cells on the current page.
    initial_data = driver.find_elements_by_tag_name('td')
    for initial in initial_data:
        page_links = initial.find_elements_by_tag_name('a')
        for page in page_links:
            page_link = page.get_attribute("href")
            links_total.append(page_link)
    # driver.refresh()
    try:
        # Recurse while a "next" link exists; the last page has none,
        # so find_element raises and the recursion stops.
        next_page = driver.find_element_by_partial_link_text('next')
        next_page.click()
        time.sleep(2)
        first_links(links_total)
    except (TimeoutError, ElementNotVisibleException, NoSuchElementException):
        print("NEXT btn not found")

    return links_total

all_links = first_links()

for link in all_links:
    print(link)
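
In Selenium 4 the find_element_by_* helpers were deprecated and later removed, and an explicit wait is more robust than a fixed time.sleep. A minimal sketch of the same loop against the current API, assuming Selenium 4.6+ so Selenium Manager can locate chromedriver on its own (iterative rather than recursive):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)  # Selenium Manager resolves the driver binary

driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")

links_total = []
while True:
    # Wait until the table cells are present instead of sleeping a fixed time.
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, 'td')))
    for cell in driver.find_elements(By.TAG_NAME, 'td'):
        for anchor in cell.find_elements(By.TAG_NAME, 'a'):
            links_total.append(anchor.get_attribute("href"))
    try:
        # The last page has no "next" link, which ends the loop.
        driver.find_element(By.PARTIAL_LINK_TEXT, 'next').click()
    except NoSuchElementException:
        break

driver.quit()

for link in links_total:
    print(link)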

You do not actually need Selenium for this. You can do it with requests and BeautifulSoup like this:

import requests
from bs4 import BeautifulSoup

url_cbp = r"https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=&page={}"

def get_links(links_total=[], page_num=0):
    page = requests.get(url_cbp.format(page_num))
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(id='region-content')

    # Each broker entry sits in a table cell; pull the link out of every cell.
    table_cells = results.find_all('td', class_='views-field')
    for cell in table_cells:
        cell_link = cell.find('a')
        page_link = cell_link["href"]
        links_total.append(page_link)

    # The pager renders a "next" item on every page except the last one.
    next_page = results.find('li', class_='pager-next')
    if next_page:
        page_num += 1
        get_links(links_total, page_num)

    return links_total

all_links = get_links()

for link in all_links:
    print(link)
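
One caveat: Selenium's get_attribute("href") returns absolute URLs, but BeautifulSoup hands back the raw href values, which are site-relative on this page. An iterative variant that resolves them with urllib.parse.urljoin and avoids the mutable-default-argument accumulator might look like this (a sketch under the same page-structure assumptions):

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE = "https://www.cbp.gov"
URL = BASE + "/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=&page={}"

def get_links():
    links_total = []  # fresh list on every call, not a shared default
    page_num = 0
    while True:
        page = requests.get(URL.format(page_num))
        soup = BeautifulSoup(page.content, 'html.parser')
        results = soup.find(id='region-content')
        for cell in results.find_all('td', class_='views-field'):
            anchor = cell.find('a')
            if anchor and anchor.get("href"):
                # Resolve relative hrefs against the site root.
                links_total.append(urljoin(BASE, anchor["href"]))
        # Stop when the pager has no "next" item.
        if results.find('li', class_='pager-next') is None:
            return links_total
        page_num += 1

for link in get_links():
    print(link)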
