python - 提取数据限制 - Selenium 和 BS4
问题描述
你能打消我的疑虑吗?
我使用下面的代码从网站中提取数据。然而,同样在内部创建另一个函数以进行迭代和 oagination 服务器将我的提取限制在 10 K 行中。
获得超过 10 K 行的唯一方法是再次执行代码,但是我失去了这个动作的分页。
我怎样才能绕过这个限制,这是我可以使用 webdriver 的一些选项?
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
login_url = 'sample.com'
data = {
'username': 'MY_USER',
'password': 'MY_PASS',
}
def openDriver():
# i put some security measures so website can't detect that u use selenium, just incase!
options = webdriver.ChromeOptions()
options.add_experimental_option("useAutomationExtension", False)
options.add_argument("--headless")
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("--disable-features=NetworkService") ##this did it for me
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-features=VizDisplayCompositor")
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(10)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'plugins', {
get: () => '[1,2,3]'
})
"""
})
return driver
# open driver
driver = openDriver()
# open login page
driver.get(login_url)
# type username
userBox = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH ,'//*[@id="user_id"]')))
userBox.send_keys(data['username'])
# click on the continue button
contineButton = driver.find_element_by_xpath('//*[@id="login_user_form"]/div[2]/button')
contineButton.click()
# type password
passBox = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH ,'//*[@id="password"]')))
passBox.send_keys(data['password'])
# click on the login button
loginButton = driver.find_element_by_xpath('//*[@id="action-complete"]')
loginButton.click()
# scrape url
report_final = []
url='sample.com'
i = 0
for x in range(0,300):
driver.get(url+str(i))
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
content = soup.find_all('table', class_='andes-table table table-sticky')
table = soup.find("table", { "class" : "andes-table table table-sticky" })
for row in table.findAll("tr"):
cells = row.findAll("td")
if len(cells) == 9:
td = cells[0].find(text=True)
process = cells[1].find(text=True)
tp = cells[2].find(text=True)
inventory_id = cells[3].find(text=True)
qty = cells[4].find(text=True)
origem = cells[5].find(text=True)
destino = cells[6].find(text=True)
data = cells[7].find(text=True)
movements_report = {
'td':td,
'process':process,
'tp':tp,
'inventory_id':inventory_id,
'qty':qty,
'origem':origem,
'destino':destino,
'data':data
}
report_final.append(movements_report)
df = pd.DataFrame(report_final)
df['data'] = df['data'].str.replace('\n', '|', regex=False)
df.to_csv('v11.csv', index=False)
i = i + 1000
time.sleep(3)
login_url = 'sample.com'
data = {
'username': 'MY_USER',
'password': 'MY_PASS',
}
def openDriver():
# i put some security measures so website can't detect that u use selenium, just incase!
options = webdriver.ChromeOptions()
options.add_experimental_option("useAutomationExtension", False)
options.add_argument("--headless")
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("--disable-features=NetworkService") ##this did it for me
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-features=VizDisplayCompositor")
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(10)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'plugins', {
get: () => '[1,2,3]'
})
"""
})
return driver
driver.close()
# open driver
driver = openDriver()
# open login page
driver.get(login_url)
# type username
userBox = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH ,'//*[@id="user_id"]')))
userBox.send_keys(data['username'])
# click on the continue button
contineButton = driver.find_element_by_xpath('//*[@id="login_user_form"]/div[2]/button')
contineButton.click()
# type password
passBox = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH ,'//*[@id="password"]')))
passBox.send_keys(data['password'])
# click on the login button
loginButton = driver.find_element_by_xpath('//*[@id="action-complete"]')
loginButton.click()
print(df.head())
print(len(report_final))
print(soup.title)
# close driver
driver.close()```
解决方案
推荐阅读
- r - 是否有 R 函数可以帮助我绘制单个节点的网络连接?
- arrays - 我需要得到预期的输出?我如何改进我的功能?C 社区的新手?
- redirect - 在 Kong API 网关中将重定向响应中的位置标头从 HTTP 更改为 HTTPS
- python - 如何使用 https://www.googleapis.com/androidpublisher/v3 从我的应用的 Play 商店评级中读取评级
- javascript - 如何添加过滤器以创建从一个栏到另一个栏的单个路径(不在循环中)
- r - 如何将三角形添加到 R 中的 ggplot2 颜色条以指示超出范围的值?
- javascript - 为什么此代码更改字符串但又恢复到原始状态
- odoo - 如何禁用其他用户的更改密码选项?
- typescript - Typescript assumes the property type to be T | Properties["key"] instead of just T
- c++ - 就地创建和填充数组