首页 > 解决方案 > 我在抓取向下滚动页面时遇到错误

问题描述

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import NoSuchElementException 
import time
import sys
import pandas as pd
import mysql.connector


mydb = mysql.connector.connect(
  host="localhost",
  user="danish-khan",
  password="12345",
  db='reseachgate_profiles'
)

cur = mydb.cursor()


#create table
cur.execute("""DROP TABLE IF EXISTS Data2""")

cur.execute(''' CREATE TABLE IF NOT EXISTS Data2
               (Id INT NOT NULL PRIMARY KEY AUTO_INCREMENT,
                Name varchar(255),
                Institution VARCHAR(255),
                Department varchar(255),
                Citations INTEGER,
                Recommendation INTEGER, 
                Total_Reads INTEGER, 
                Total_research_interest DECIMAL(7,1), 
                Research_items INTEGER,
                Projects INTEGER, 
                Questions  INTEGER,
                Answers INTEGER, 
                Scores  DECIMAL(7,1),
                Followers INTEGER,
                Followings INTEGER
               )''')



login_url = 'https://www.researchgate.net/login'
base_url = "https://www.researchgate.net/institution/COMSATS-University-Islamabad/department/Department-of-Computer-Science/members"
chrome_driver_path = '/home/danish-khan/scrapers/researchgate/chromedriver'

chrome_options = Options()
#chrome_options.add_argument('--headless')

webdriver = webdriver.Chrome(
  executable_path=chrome_driver_path, options=chrome_options
)

# default login credential and search query
username = 'your username'
password = 'your password'

results = []
total_profiles = []

with webdriver as driver:
    # Set timeout time 
    wait = WebDriverWait(driver, 5)

    # retrive url in headless browser
    driver.get(login_url)
    
    driver.find_element_by_id("input-login").send_keys(username)
    driver.find_element_by_id("input-password").send_keys(password)
    driver.find_element_by_class_name("nova-c-button__label").find_element(By.XPATH, "./..").click()
    time.sleep(2)

    driver.get(base_url)

    time.sleep(3)
    #driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    #names = driver.find_elements_by_css_selector('.display-name')
    #name = driver.find_elements_by_xpath('//ul[@class="list people-list-m"]/li//a[@class="display-name"]')
    last_height = driver.execute_script('return document.body.scrollHeight')
    print('height:',last_height)
    time.sleep(5)
   
    while True:
       # Scroll down to bottom
      driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
      time.sleep(2)
      
      #driver.execute_script("window.scrollTo(1, 5000);")
      new_height = driver.execute_script("return document.body.scrollHeight")
      print('new height:' +str(new_height))
      if new_height == last_height:
          break
      last_height = new_height    
    total_profiles.append(last_height)
    links = '//ul[@class="list people-list-m"]/li//a[@class="display-name"]'
    name = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.XPATH, links))
              )
    print(len(name))
    lenname = len(name)
    total_profiles = total_profiles[0]
    print('total  profiles:', total_profiles)
    #selector = '.display-name'
    selector = '//ul[@class="list people-list-m"]/li//a[@class="display-name"]'
   
    for i in range(0,lenname-1):
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
        
            time.sleep(5)
            links = WebDriverWait(driver, 30).until(
            EC.presence_of_all_elements_located((By.XPATH, selector))
              )
                     
            links[i].click()

            details = {
                'Name' : driver.find_element_by_css_selector('.nova-e-text--size-xl.nova-e-text--color-grey-900').text,
                'Institution' : '',
                'Department' : driver.find_element_by_css_selector('.nova-v-institution-item__info-section-list-item .nova-e-link--theme-bare').text,
                'Citations' : '',
                'Recommendation' : '',
                'Total_Reads' : '',
                'Total_research_interest' : '',
                'Research_items' : '',
                'Projects' : '',
                'Questions' : '',
                'Answers' : '',
                'Scores' : '',
                'Followings' : '',
                'Followers' : ''
        
             }
    
    
            try:
              Institution = driver.find_element_by_css_selector('.nova-v-institution-item__title .nova-e-link--theme-bare').text
            except:
              Institution = 'N/A'
            
            try:   
              Citations = driver.find_element_by_css_selector('.application-box-layout__item--m:nth-child(2) .nova-e-text--size-xl').text
              Citations = int(Citations.replace(",", ""))
            except:
              Citations = 0 
                 
            try:     
              Recommendation =   driver.find_element_by_css_selector('.application-box-layout__item--m:nth-child(3) .nova-e-text--size-xl').text
              Recommendation = int(Recommendation.replace(' ', ''))
            except:
              Recommendation = 0 
            
            try:  
              Total_Reads = driver.find_element_by_css_selector('.application-box-layout__item--m:nth-child(4) .nova-e-text--size-xl').text                      
              Total_Reads = int(Total_Reads.replace(",", ""))
            except:
              Total_Reads = 0 
            
            try:
              Total_research_interest =   driver.find_element_by_css_selector('.application-box-layout__item--m:nth-child(1) .nova-e-text--size-xl').text           
              Total_research_interest = float(Total_research_interest)
            except:
              Total_research_interest = 0.0
            
            try:
              Research_items = driver.find_element_by_css_selector('.application-box-layout__item--xs:nth-child(1) .nova-e-text--color-inherit').text
              Research_items = int(Research_items)
            except:
              Research_items = 0
            
            try:
              Projects = driver.find_element_by_css_selector('.application-box-layout__item--xs:nth-child(2) .nova-e-text--color-inherit').text
              Projects = int(Projects)
            except:
              Projects = 0

            try:   
              Questions = driver.find_element_by_css_selector('.application-box-layout__item--xs:nth-child(3) .nova-e-text--size-xl').text
              Questions = int(Questions)
            except:
              Questions = 0
            
            try:
              Answers = driver.find_element_by_css_selector('.application-box-layout__item--xs:nth-child(4) .nova-e-text--size-xl').text
              Answers = int(Answers)
            except:
              Answers = 0
            
            
            try:
                Scores = driver.find_element_by_css_selector('.profile-header-details-meta-items .nova-e-list__item:nth-child(1)').text
                
                Scores = float(Scores)
            except: 
                Scores = 0
            
            try:                             
              Followings = driver.find_element_by_xpath(xpath = "//*[contains(text(), 'Following')]").text.strip('Following').strip('( )')
              
              Followings = int(Followings)
            except:
              Followings = 0
            
            try:                             
              Followers = driver.find_element_by_xpath(xpath = "//*[contains(text(), 'Followers')]").text.strip('Followers').strip('( )')

              Followers = int(Followers)
            except:
              Followers = 0  
            
            
            
            details['Institution'] = Institution 
            details['Citations'] =  Citations
            details['Recommendation'] =  Recommendation
            details['Total_Reads'] = Total_Reads 
            details['Total_research_interest'] = Total_research_interest 
            details['Research_items'] =  Research_items
            details['Projects'] = Projects 
            details['Questions'] = Questions 
            details['Answers'] =  Answers 
            details['Scores'] =  Scores
            details['Followings'] = Followings
            details['Followers'] = Followers
             
            results.append(details)
            driver.back()

            time.sleep(5)
            

profile_details = pd.DataFrame(results)
print(profile_details)

for row in profile_details.itertuples():
            
    cur.execute('''INSERT INTO Data2
                (Name,
                Institution,
                Department,
                Citations,
                Recommendation,
                Total_Reads,
                Total_research_interest,
                Research_items , Projects,
                Questions,
                Answers,
                Scores,
                Followers, 
                Followings)
                VALUES
                ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s", "%s" )''',
                (row.Name,
                row.Institution,
                row.Department,
                row.Citations,
                row.Recommendation,
                row.Total_Reads,
                row.Total_research_interest,
                row.Research_items,
                row.Projects,
                row.Questions,
                row.Answers,
                row.Scores,
                row.Followers,
                row.Followings ) )
                  
            
mydb.commit()
        
print('complete.')
  

mydb.close()
time.sleep(5)

driver.close()

我正在抓取researchgate.com网站用户配置文件机构明智和机构部门明智,如果部门成员数量较少,则将其抓取,如果部门成员数量较多,则不抓取并给我错误。

 resgt3.py
height: 1907
new height:2622
new height:3342
new height:4062
new height:4792
new height:5517
new height:6247
new height:6967
new height:7692
new height:8412
new height:9127
new height:9857
new height:10577
new height:11307
new height:12022
new height:12752
new height:13477
new height:14217
new height:14937
new height:15647
new height:16367
new height:17092
new height:17817
new height:18547
new height:19267
new height:20002
new height:20727
new height:21442
new height:22167
new height:22887
new height:23622
new height:24347
new height:25072
new height:25797
new height:26517
new height:27247
new height:27967
new height:28692
new height:29417
new height:30142
new height:30862
new height:31582
new height:32312
new height:33032
new height:33747
new height:34457
new height:35187
new height:35902
new height:36632
new height:37357
new height:38082
new height:38812
new height:39542
new height:40267
new height:40997
new height:41722
new height:42447
new height:43182
new height:43912
new height:44637
new height:45357
new height:46077
new height:46802
new height:47527
new height:48252
new height:48982
new height:49712
new height:50437
new height:51157
new height:51877
new height:52607
new height:53337
new height:54062
new height:54787
new height:55507
new height:56247
new height:56972
new height:57697
new height:58417
new height:59137
new height:59862
new height:60587
new height:61312
new height:62027
new height:62757
new height:63487
new height:64197
new height:64917
new height:65647
new height:66372
new height:67092
new height:67822
new height:68557
new height:69282
new height:70007
new height:70732
new height:71457
new height:72187
new height:72907
new height:73632
new height:74372
new height:75087
new height:75807
new height:76532
new height:77257
new height:77987
new height:78717
new height:79427
new height:80152
new height:80867
new height:81587
new height:82322
new height:83037
new height:83762
new height:84487
new height:85202
new height:85927
new height:86642
new height:87367
new height:88092
new height:88812
new height:89522
new height:90257
new height:90982
new height:91717
new height:92235
new height:92235
1268
total  profiles: 92235
Traceback (most recent call last):
  File "resgt3.py", line 121, in <module>
    links[i].click()
IndexError: list index out of range

我不认为为什么会发生这种情况 这个网站就像 Linkedin 但我想知道出现此错误的原因以及任何解决方案。

标签: seleniumweb-scraping

解决方案


推荐阅读