python - 为什么 Python Selenium 经常导致页面未加载?
问题描述
这个问题更多是为了帮助我理解原因(并缓解我的挫败感),而不是询问如何解决;正如标题所述:为什么用 Selenium(我使用的是 Python)加载 URL/页面时,页面经常加载不出来并引发 NoSuchElementException 错误?我明白,和正常浏览一样,网页有时会加载失败。但我发现有 25% - 50% 的 URL/页面加载尝试在 30 秒超时后仍未成功,因此我不得不重试多达 10 次,并在每次尝试之间逐步增加等待时间,才能最终得到一次 URL/页面成功加载的结果。
如果你能帮助我理解,那将不胜感激。
提前感谢您的解释。
示例代码
我目前正在用 https://www.carsales.com.au 做试验
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import mysql.connector
import time
import datetime
from pyvirtualdisplay import Display
# Start a pyvirtualdisplay Display (visible=0, 1920x1080) before launching the browser.
display = Display(visible=0, size=(1920, 1080))
display.start()
# Build the Chrome options: both sandbox-related switches are passed to disable sandboxing.
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-setuid-sandbox")
driver = webdriver.Chrome(chrome_options=chrome_options)
# Open the MySQL connection; the connection arguments are redacted in this listing.
con = mysql.connector.connect(*****)
cursor = con.cursor()
# The SQL text is redacted; all result rows are fetched into `searches`.
sql_user_searches = "****"
cursor.execute(sql_user_searches)
searches = cursor.fetchall()
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below is not visible; the comments describe each statement as
# written and hedge wherever the intended nesting matters.
# Iterate over the rows fetched from the database; columns 2 and 4 of each row
# are substituted into the URL path.
for z in searches:
offset = 0
# Only {0} and {1} appear in this template, so the third argument (offset) is unused here.
url = "https://www.carsales.com.au/cars/{0}/{1}/".format(z[2],z[4],offset)
# Retry settings: start with a 5-second wait and allow up to 100 attempts.
sleep_time = 5
num_retries = 100
error = 0
for loopingcow in range(0, num_retries):
try:
error = 0
driver.get(url)
# Fixed pause after navigation before probing for the results container.
time.sleep(sleep_time)
# Probe for an element whose class attribute is exactly "result-set-container "
# (note the trailing space); the returned HTML is discarded. A missing element
# is expected to surface as the NoSuchElementException caught below.
driver.find_element_by_xpath("""//*[@class="result-set-container "]""").get_attribute("outerHTML")
print("success")
except NoSuchElementException:
print("error")
error = 1
pass
# On failure wait again and lengthen the wait by one second; otherwise stop retrying.
if error == 1:
time.sleep(sleep_time) # wait before trying to fetch the data again
sleep_time += 1 # Implement your backoff algorithm here i.e. exponential backoff
else:
break
# Read the text of the first <p> matched by the pagination XPath; [0] raises
# IndexError if nothing matched.
total_pagination = driver.find_elements_by_xpath("""//div[@class="tabbed-pagination"]/div[@class="pagination-container"]/div[@class="pagination-container"]/div[@class="pagination"]/p""")[0].text
# The page count is taken from the second space-separated token of that text.
number_of_pages_split = total_pagination.split(" ")
number_of_pages = int(number_of_pages_split[1])
page = 0
# Visit each results page; the offset query parameter advances by 12 per page.
while page < number_of_pages:
offset = page * 12
url = "https://www.carsales.com.au/cars/{0}/{1}/?offset={2}".format(z[2],z[4],offset)
print(url)
# Same retry settings and retry loop as for the first page load.
sleep_time = 5
num_retries = 100
error = 0
for loopyloop in range(0, num_retries):
try:
error = 0
driver.get(url)
time.sleep(sleep_time)
driver.find_element_by_xpath("""//*[@class="result-set-container "]""").get_attribute("outerHTML")
print("success")
except NoSuchElementException:
print("error")
error = 1
pass
if error == 1:
time.sleep(sleep_time) # wait before trying to fetch the data again
sleep_time += 1 # Implement your backoff algorithm here i.e. exponential backoff
else:
break
# Collect every <div> whose class contains "listing-item" and walk them by index.
rows = driver.find_elements_by_xpath("""//div[contains(@class,"listing-item")]""")
count = len(rows)
i = 0
while i < count:
# NOTE(review): this XPath starts with "//" rather than ".//", so it is
# presumably evaluated against the whole document rather than relative to
# rows[i]; the trailing [i] then picks the i-th match overall - confirm.
title = rows[i].find_elements_by_xpath("""//div[contains(@class,"title ")]/a/h2""")[i].text
i = i + 1
# The SQL template and its arguments are redacted in this listing.
# NOTE(review): the statement is built with str.format; if scraped values are
# interpolated, a parameterized query would be safer - confirm.
query = """****""".format(*****)
cursor.execute(query)
con.commit()
page = page + 1
# Close the database cursor and connection, then quit the browser.
cursor.close()
con.close()
driver.quit()
# Kill the process object exposed as display.popen.
display.popen.kill()
print("success")
具有 30 秒超时的第二个示例代码
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import mysql.connector
import time
from pyvirtualdisplay import Display
# Start a pyvirtualdisplay Display (visible=0, 1920x1080) before launching the browser.
display = Display(visible=0, size=(1920, 1080))
display.start()
# Build the Chrome options: both sandbox-related switches are passed to disable sandboxing.
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-setuid-sandbox")
driver = webdriver.Chrome(chrome_options=chrome_options)
# Current day of month and month number, as integers.
date = int(time.strftime("%d"))
month = int(time.strftime("%m"))
# Open the MySQL connection; the connection arguments are redacted in this listing.
con = mysql.connector.connect(*****)
cursor = con.cursor()
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below is not visible; the comments describe each statement as
# written and hedge wherever the intended nesting matters.
# z is the month number; range(11, 13) yields only 11 and 12, so the branches
# for 8, 9 and 10 below cannot match within this loop.
for z in range(11, 13):
# end_date is later used as the exclusive upper bound of range(start_date, end_date).
if z == 9:
end_date = 31
elif z == 10:
end_date = 32
elif z == 11:
end_date = 31
elif z == 12:
end_date = 32
elif z == 8:
end_date = 32
# Choose the first day to scrape for month z from today's day and month.
start_date = 1
if z == month and (end_date - date) < 5:
start_date = end_date
elif z == (month + 1) and (end_date - date) < 5:
start_date = start_date + 4 - (end_date - date)
elif z > month:
start_date = 1
else:
start_date = date
print(z)
print(start_date)
print(end_date)
# Iterate over the days of the month from start_date up to (not including) end_date.
for x in range(start_date, end_date):
time.sleep(2)
# Zero-pad day and month to two digits for the URL.
x_url = str(x).zfill(2)
z_url = str(z).zfill(2)
# NOTE(review): this rebinds `date`, which held the integer day of month set
# before the loop, to a "DD-MM" string; a later `end_date - date` would then
# subtract a string from an int - confirm against the intended nesting.
date = x_url + "-" + z_url
url = "https://www.tiket.com/pesawat/cari?d=DPS&a=JKT&date=2017-{1}-{0}&adult=2&child=0&infant=0".format(x_url,z_url)
print(url)
driver.get(url)
# Fixed 30-second pause after navigation.
time.sleep(30)
last_height = driver.execute_script("return document.body.scrollHeight")
print(last_height)
w = 0
# NOTE(review): w is assigned last_height right after the print, so as written
# this condition can hold at most once - confirm the intended loop body.
while w < last_height:
print("Success")
w = last_height
try:
time.sleep(30)
# Print the outer HTML of the element with id "tbody_depart".
print(driver.find_element_by_xpath("""//*[@id="tbody_depart"]""").get_attribute("outerHTML"))
# Each <tr> whose id contains "flight" carries the flight fields as data-* attributes.
rows = driver.find_elements_by_xpath("""//tr[contains(@id,"flight")]""")
for row in rows:
airline = row.get_attribute("data-airlinesname")
price = row.get_attribute("data-price")
departure = row.get_attribute("data-depart")
arrival = row.get_attribute("data-arrival")
baggage = row.get_attribute("data-baggage")
stops = row.get_attribute("data-stoptext")
# The SQL template and its arguments are redacted in this listing.
# NOTE(review): the statement is built with str.format; if scraped values are
# interpolated, a parameterized query would be safer - confirm.
query = """****""".format(******)
print(query)
cursor.execute(query)
con.commit()
# Bare except: any exception leads to a page reload and a second attempt at the
# same extraction; the statements of that second attempt are not themselves
# wrapped in a try.
except:
driver.get(url)
time.sleep(30)
print(driver.find_element_by_xpath("""//*[@id="tbody_depart"]""").get_attribute("outerHTML"))
rows = driver.find_elements_by_xpath("""//tr[contains(@id,"flight")]""")
for row in rows:
airline = row.get_attribute("data-airlinesname")
price = row.get_attribute("data-price")
departure = row.get_attribute("data-depart")
arrival = row.get_attribute("data-arrival")
baggage = row.get_attribute("data-baggage")
stops = row.get_attribute("data-stoptext")
query = """*****""".format(*****)
print(query)
cursor.execute(query)
con.commit()
# Close the database cursor and connection.
cursor.close()
con.close()
# This listing calls driver.close() here, whereas the first example calls driver.quit().
driver.close()
# Kill the process object exposed as display.popen.
display.popen.kill()
解决方案
推荐阅读
- ruby-on-rails - 如何在rails中插入has_many关联
- html - 仅使用 CSS 从路由器出口组件中隐藏页脚组件?
- javascript - 在 .then() 中显式调用 resolve()
- react-admin - 如何从文档中的主题示例访问“管理”对象?
- r - 如何使用循环或应用函数计算两个日期范围之间变量的平均值?
- cypress - Cypress:保存 API 响应中的数据以便重复使用
- unreal-engine4 - (虚幻引擎 4)有没有办法可以在运行时将骨架网格体姿势转换为静态网格体?
- .net - 如何获取 Dotnet 缓存以保留 API 控制器调用之间的值?
- mysql - 来自 Microsoft Office 插件 taskpane.js 的 MySQL 驱动程序模块的空引用
- jquery - Select2 & Bootstrap 模态问题设置选择选项