How to resume scraping data from where the loop broke due to an error when using Python Selenium

Problem description

I am using Selenium in Python to scrape data from the ScienceDirect website. I am able to scrape the data, but sometimes a new window opens in the driver, and the code breaks after extracting data from a few hundred articles. I would like to know whether it is possible to resume extraction from the point where the code broke.

#Importing libraries
import requests
import os
import json
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup  
import time
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
sciencedirect_list=[]

options = webdriver.ChromeOptions() 

options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(options=options, 
 executable_path=r"C:\Selenium\chromedriver_win32\chromedriver.exe")

links=['https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=1','https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=2','https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=3']

for i in links:
    driver = webdriver.Chrome(options=options, 
    executable_path=r"C:\Selenium\chromedriver_win32\chromedriver.exe")
    driver.get(i)
    sleep(4)
    accordions = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "li.accordion-panel.js-accordion-panel>button.accordion-panel-title>span")))
    for accordion in accordions:
        ActionChains(driver).move_to_element(accordion).click(accordion).perform()

    issues = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "a.anchor.js-issue-item-link.text-m span.anchor-text")))
    window0  = driver.current_window_handle
    for issue in issues:
        ActionChains(driver).key_down(Keys.CONTROL).click(issue).key_up(Keys.CONTROL).perform()
        WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
        windows_after = driver.window_handles
        window1 = [x for x in windows_after if x != window0][0]
        driver.switch_to_window(window1)
        articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "a.anchor.article-content-title.u-margin-xs-top.u-margin-s-bottom span.anchor-text")))
        windows2=driver.current_window_handle
        for article in articles:
            ActionChains(driver).key_down(Keys.CONTROL).click(article).key_up(Keys.CONTROL).perform()
            WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(3))
            windows_after1 = driver.window_handles
            window2 = driver.window_handles[2]
            driver.switch_to_window(window2)
            sleep(3)
            sa={}
            try:
                sa["title"]=driver.find_element_by_xpath('//*[@id="screen-reader-main-title"]/span').text
            except:
                print("no title")
            try:
                sa["link"]=driver.find_element_by_xpath('//*[@id="doi-link"]/a[1]').text
            except:
                print("no link")
            try:
                sa["abstract"]=driver.find_element_by_xpath('//*[@id="ab0005"]').text
            except:
                print("no abstract")
            try:
                sa["highlights"]=driver.find_element_by_xpath('//*[@id="ab0010"]').text
            except:
                print("highlights not found")
            try:
                sa["k/c"]=driver.find_element_by_xpath('//*[@id="ks0010"]').text
            except:
                print("no keywords or classifications")
            try:
                sa["c/k"]=driver.find_element_by_xpath('//*[@id="ks0005"]').text
            except:
                print("no keywords or classifications")
            try:
                sa["body"]=driver.find_element_by_xpath('//*[@id="body"]').text
            except:
                print("no body")

        sciencedirect_list.append(sa)
        driver.close()
        driver.switch_to_window(window1)

    driver.close()
    driver.switch_to_window(window0)


driver.close()

It would be very helpful if someone could provide a working version of this code.

Tags: python, selenium, selenium-webdriver, web-scraping

Solution


When working with windows, if you can assume that new windows open from left to right and close from right to left, then you can easily switch to the most recently opened window with driver.switch_to_window(driver.window_handles[-1]).

If I were you, I would simply call it after every click that opens a new tab, and again every time you call driver.close().
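As a minimal sketch of that pattern (here element is a placeholder for whatever link you Ctrl+click, not something from the code above):

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# open the link in a new tab (Ctrl+click), then jump to the newest window
ActionChains(driver).key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
driver.switch_to_window(driver.window_handles[-1])   # newest window is rightmost

# ... scrape the newly opened tab here ...

driver.close()                                       # close the current tab
driver.switch_to_window(driver.window_handles[-1])   # back to the newest remaining window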

I took the liberty of reformatting your code a little, and in the process I removed the open-and-close-new-tabs flow entirely. Hopefully it still feels faithful to the original. It is much faster than your version, and if you add threading to it and launch it in headless mode, it will actually be quite good.
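The headless part is just an extra Chrome option or two (a sketch using the standard Chrome switches):

options = webdriver.ChromeOptions()
options.add_argument("--headless")               # run Chrome without a visible window
options.add_argument("--window-size=1920,1080")  # give headless Chrome a real viewport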

The idea is to collect all the issue links from the issues pages, then collect all the article links, and only then visit each article and extract the data you want.

Also, I used webdriver_manager because I didn't want to download a new chromedriver manually.

import traceback
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager  # pip install webdriver_manager


options = webdriver.ChromeOptions()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
driver.set_page_load_timeout(20)

links=['https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=1',
       'https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=2',
       'https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=3']

def get_url_and_wait_for_page_load(_driver, url):
    _driver.get(url)
    WebDriverWait(_driver, 30).until(
        EC.visibility_of_all_elements_located((By.XPATH, "//div[@class = 'usabilla_live_button_container']")))

volume_links, issue_parse_failed = [], []  # you can reprocess failed list if you wish
for link in links:
    try:
        print "Looking for volume links @ {}".format(link)
        get_url_and_wait_for_page_load(driver, link)
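        # expand the volume accordions with in-page JS; note the for..in loop
        # only clicks panels with index > 1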
        driver.execute_script('var accordions = document.getElementsByClassName("accordion-panel-title"); '
                              'for(accordion in accordions) if(accordion > 1) accordions[accordion].click();')
        volume_links += [link.get_attribute("href")
                         for link in driver.find_elements_by_xpath("//a[contains(@class, 'js-issue-item-link')]")]
        print "Total volume links: {}".format(len(volume_links))
    except:
        print "Failed to get volume links @ {}".format(link)
        issue_parse_failed.append(link)

article_links, volume_parse_failed = [], []  # you can reprocess failed list if you wish
for volume_link in volume_links:
    try:
        print "Looking for article links @ {}".format(volume_link)
        get_url_and_wait_for_page_load(driver, volume_link)
        article_links += [link.get_attribute("href")
                          for link in driver.find_elements_by_xpath("//a[contains(@class, 'article-content-title')]")]
        print "Total article links: {}".format(len(article_links))
    except:
        print(traceback.format_exc())
        print("Failed to get article links @ {}".format(volume_link))
        volume_parse_failed.append(volume_link)

sciencedirect_list, article_parse_failed = [], []
for article_link in article_links:
    try:
        print "Extracting data for article @ {}".format(article_link)
        get_url_and_wait_for_page_load(driver, article_link)
        sa = {}
        for item in [{"title": '//*[@id="screen-reader-main-title"]/span',
                      "link": '//*[@id="doi-link"]/a[1]',
                      "abstract": '//*[@id="ab0005"]',
                      "highlights": '//*[@id="ab0010"]',
                      "k/c": '//*[@id="ks0010"]',
                      "c/k": '//*[@id="ks0005"]',
                      "body": '//*[@id="body"]'}]:
            for name, xpath in item.items():
                try:
                    sa[name] = driver.find_element_by_xpath(xpath).text
                except:
                    sa[name] = None
        sciencedirect_list.append(sa)
    except:
        print "Failed to extract article's data  @ {}".format(article_link)
        article_parse_failed.append(article_link)

driver.close()
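Finally, to address the original question of resuming after a crash: because this version works from plain lists of URLs, you can persist progress to disk and skip already-scraped links on restart. A minimal sketch (scraped.json is an arbitrary filename, and scrape_article is a hypothetical stand-in for the extraction loop above):

import json
import os

CHECKPOINT = "scraped.json"

# load previously scraped articles, keyed by link, if a checkpoint exists
done = {}
if os.path.exists(CHECKPOINT):
    with open(CHECKPOINT) as f:
        done = json.load(f)

for article_link in article_links:
    if article_link in done:                       # already scraped on a previous run
        continue
    sa = scrape_article(driver, article_link)      # hypothetical helper: the extraction loop above
    done[article_link] = sa
    with open(CHECKPOINT, "w") as f:               # checkpoint after every article
        json.dump(done, f)

Dumping the checkpoint after every article is cheap at this scale, and it means a crash only ever loses the article currently being processed.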

