selenium - 我在抓取向下滚动页面时遇到错误
问题描述
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import NoSuchElementException
import time
import sys
import pandas as pd
import mysql.connector
mydb = mysql.connector.connect(
host="localhost",
user="danish-khan",
password="12345",
db='reseachgate_profiles'
)
cur = mydb.cursor()
#create table
cur.execute("""DROP TABLE IF EXISTS Data2""")
cur.execute(''' CREATE TABLE IF NOT EXISTS Data2
(Id INT NOT NULL PRIMARY KEY AUTO_INCREMENT,
Name varchar(255),
Institution VARCHAR(255),
Department varchar(255),
Citations INTEGER,
Recommendation INTEGER,
Total_Reads INTEGER,
Total_research_interest DECIMAL(7,1),
Research_items INTEGER,
Projects INTEGER,
Questions INTEGER,
Answers INTEGER,
Scores DECIMAL(7,1),
Followers INTEGER,
Followings INTEGER
)''')
login_url = 'https://www.researchgate.net/login'
base_url = "https://www.researchgate.net/institution/COMSATS-University-Islamabad/department/Department-of-Computer-Science/members"
chrome_driver_path = '/home/danish-khan/scrapers/researchgate/chromedriver'
chrome_options = Options()
#chrome_options.add_argument('--headless')
webdriver = webdriver.Chrome(
executable_path=chrome_driver_path, options=chrome_options
)
# default login credential and search query
username = 'your username'
password = 'your password'
results = []
total_profiles = []
with webdriver as driver:
# Set timeout time
wait = WebDriverWait(driver, 5)
# retrive url in headless browser
driver.get(login_url)
driver.find_element_by_id("input-login").send_keys(username)
driver.find_element_by_id("input-password").send_keys(password)
driver.find_element_by_class_name("nova-c-button__label").find_element(By.XPATH, "./..").click()
time.sleep(2)
driver.get(base_url)
time.sleep(3)
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#names = driver.find_elements_by_css_selector('.display-name')
#name = driver.find_elements_by_xpath('//ul[@class="list people-list-m"]/li//a[@class="display-name"]')
last_height = driver.execute_script('return document.body.scrollHeight')
print('height:',last_height)
time.sleep(5)
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
#driver.execute_script("window.scrollTo(1, 5000);")
new_height = driver.execute_script("return document.body.scrollHeight")
print('new height:' +str(new_height))
if new_height == last_height:
break
last_height = new_height
total_profiles.append(last_height)
links = '//ul[@class="list people-list-m"]/li//a[@class="display-name"]'
name = WebDriverWait(driver, 5).until(
EC.presence_of_all_elements_located((By.XPATH, links))
)
print(len(name))
lenname = len(name)
total_profiles = total_profiles[0]
print('total profiles:', total_profiles)
#selector = '.display-name'
selector = '//ul[@class="list people-list-m"]/li//a[@class="display-name"]'
for i in range(0,lenname-1):
driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
time.sleep(5)
links = WebDriverWait(driver, 30).until(
EC.presence_of_all_elements_located((By.XPATH, selector))
)
links[i].click()
details = {
'Name' : driver.find_element_by_css_selector('.nova-e-text--size-xl.nova-e-text--color-grey-900').text,
'Institution' : '',
'Department' : driver.find_element_by_css_selector('.nova-v-institution-item__info-section-list-item .nova-e-link--theme-bare').text,
'Citations' : '',
'Recommendation' : '',
'Total_Reads' : '',
'Total_research_interest' : '',
'Research_items' : '',
'Projects' : '',
'Questions' : '',
'Answers' : '',
'Scores' : '',
'Followings' : '',
'Followers' : ''
}
try:
Institution = driver.find_element_by_css_selector('.nova-v-institution-item__title .nova-e-link--theme-bare').text
except:
Institution = 'N/A'
try:
Citations = driver.find_element_by_css_selector('.application-box-layout__item--m:nth-child(2) .nova-e-text--size-xl').text
Citations = int(Citations.replace(",", ""))
except:
Citations = 0
try:
Recommendation = driver.find_element_by_css_selector('.application-box-layout__item--m:nth-child(3) .nova-e-text--size-xl').text
Recommendation = int(Recommendation.replace(' ', ''))
except:
Recommendation = 0
try:
Total_Reads = driver.find_element_by_css_selector('.application-box-layout__item--m:nth-child(4) .nova-e-text--size-xl').text
Total_Reads = int(Total_Reads.replace(",", ""))
except:
Total_Reads = 0
try:
Total_research_interest = driver.find_element_by_css_selector('.application-box-layout__item--m:nth-child(1) .nova-e-text--size-xl').text
Total_research_interest = float(Total_research_interest)
except:
Total_research_interest = 0.0
try:
Research_items = driver.find_element_by_css_selector('.application-box-layout__item--xs:nth-child(1) .nova-e-text--color-inherit').text
Research_items = int(Research_items)
except:
Research_items = 0
try:
Projects = driver.find_element_by_css_selector('.application-box-layout__item--xs:nth-child(2) .nova-e-text--color-inherit').text
Projects = int(Projects)
except:
Projects = 0
try:
Questions = driver.find_element_by_css_selector('.application-box-layout__item--xs:nth-child(3) .nova-e-text--size-xl').text
Questions = int(Questions)
except:
Questions = 0
try:
Answers = driver.find_element_by_css_selector('.application-box-layout__item--xs:nth-child(4) .nova-e-text--size-xl').text
Answers = int(Answers)
except:
Answers = 0
try:
Scores = driver.find_element_by_css_selector('.profile-header-details-meta-items .nova-e-list__item:nth-child(1)').text
Scores = float(Scores)
except:
Scores = 0
try:
Followings = driver.find_element_by_xpath(xpath = "//*[contains(text(), 'Following')]").text.strip('Following').strip('( )')
Followings = int(Followings)
except:
Followings = 0
try:
Followers = driver.find_element_by_xpath(xpath = "//*[contains(text(), 'Followers')]").text.strip('Followers').strip('( )')
Followers = int(Followers)
except:
Followers = 0
details['Institution'] = Institution
details['Citations'] = Citations
details['Recommendation'] = Recommendation
details['Total_Reads'] = Total_Reads
details['Total_research_interest'] = Total_research_interest
details['Research_items'] = Research_items
details['Projects'] = Projects
details['Questions'] = Questions
details['Answers'] = Answers
details['Scores'] = Scores
details['Followings'] = Followings
details['Followers'] = Followers
results.append(details)
driver.back()
time.sleep(5)
profile_details = pd.DataFrame(results)
print(profile_details)
for row in profile_details.itertuples():
cur.execute('''INSERT INTO Data2
(Name,
Institution,
Department,
Citations,
Recommendation,
Total_Reads,
Total_research_interest,
Research_items , Projects,
Questions,
Answers,
Scores,
Followers,
Followings)
VALUES
("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s", "%s" )''',
(row.Name,
row.Institution,
row.Department,
row.Citations,
row.Recommendation,
row.Total_Reads,
row.Total_research_interest,
row.Research_items,
row.Projects,
row.Questions,
row.Answers,
row.Scores,
row.Followers,
row.Followings ) )
mydb.commit()
print('complete.')
mydb.close()
time.sleep(5)
driver.close()
我正在抓取researchgate.com网站用户配置文件机构明智和机构部门明智,如果部门成员数量较少,则将其抓取,如果部门成员数量较多,则不抓取并给我错误。
resgt3.py
height: 1907
new height:2622
new height:3342
new height:4062
new height:4792
new height:5517
new height:6247
new height:6967
new height:7692
new height:8412
new height:9127
new height:9857
new height:10577
new height:11307
new height:12022
new height:12752
new height:13477
new height:14217
new height:14937
new height:15647
new height:16367
new height:17092
new height:17817
new height:18547
new height:19267
new height:20002
new height:20727
new height:21442
new height:22167
new height:22887
new height:23622
new height:24347
new height:25072
new height:25797
new height:26517
new height:27247
new height:27967
new height:28692
new height:29417
new height:30142
new height:30862
new height:31582
new height:32312
new height:33032
new height:33747
new height:34457
new height:35187
new height:35902
new height:36632
new height:37357
new height:38082
new height:38812
new height:39542
new height:40267
new height:40997
new height:41722
new height:42447
new height:43182
new height:43912
new height:44637
new height:45357
new height:46077
new height:46802
new height:47527
new height:48252
new height:48982
new height:49712
new height:50437
new height:51157
new height:51877
new height:52607
new height:53337
new height:54062
new height:54787
new height:55507
new height:56247
new height:56972
new height:57697
new height:58417
new height:59137
new height:59862
new height:60587
new height:61312
new height:62027
new height:62757
new height:63487
new height:64197
new height:64917
new height:65647
new height:66372
new height:67092
new height:67822
new height:68557
new height:69282
new height:70007
new height:70732
new height:71457
new height:72187
new height:72907
new height:73632
new height:74372
new height:75087
new height:75807
new height:76532
new height:77257
new height:77987
new height:78717
new height:79427
new height:80152
new height:80867
new height:81587
new height:82322
new height:83037
new height:83762
new height:84487
new height:85202
new height:85927
new height:86642
new height:87367
new height:88092
new height:88812
new height:89522
new height:90257
new height:90982
new height:91717
new height:92235
new height:92235
1268
total profiles: 92235
Traceback (most recent call last):
File "resgt3.py", line 121, in <module>
links[i].click()
IndexError: list index out of range
我不认为为什么会发生这种情况 这个网站就像 Linkedin 但我想知道出现此错误的原因以及任何解决方案。
解决方案
推荐阅读
- python - 当我尝试下载 PyAudio 时,我刚刚收到此错误消息。这太痛苦了。请多多关照
- mysql - 如何使用 Python 将数据从树莓派传输到 MySQL 服务器?
- ios - 将字节偏移量转换为 TestFlight 崩溃日志中的行号
- rowid - 在 mimer-sql 中获取 ROWID
- json - 如何从 JSON 对象加载数据...而不是 JSON 数组
- node.js - 如何在调用 mocha 时更改 package.json 中 mocha 的“主”文件设置?
- pandas - 获取 ValueError:连接轴的所有输入数组维度必须使用 Sklearn Pipeline、ColumnTransformer 完全匹配
- html - 如何显示 Bootstrap 4 Toast 以跨越整个容器的宽度,而不是页面?
- url - 从作业中导出仪表板 URL
- javascript - 如何根据放置在输入字段中的数字显示一组 jsx 元素。反应