Data concatenation error across multiple pages in Python

Problem Description

I am getting an error while concatenating data from multiple pages and exporting it to a single CSV file. With my code, the data is exported up to page 10, but after page 10 it stops working.

import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'

d = webdriver.Chrome(executable_path=chrome_path)
d.implicitly_wait(10)
d.get(url)

Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('7')
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1464')

tableElement = d.find_element_by_id(
    'ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
table.columns = table.iloc[0]  # first row of the grid holds the header labels
table = table.iloc[1:]
table = table[table.Select == 'SurveyNo']  # assumption: SurveyNo exists for all wanted rows
surveyNo_scripts = [item.get_attribute('href') for item in d.find_elements_by_css_selector(
    "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]

for i, script in enumerate(surveyNo_scripts):
    d.execute_script(script)  # trigger the SurveyNo postback for this row
    surveys = d.find_element_by_css_selector('textarea').text
    table.iloc[[i], table.columns.get_loc('Select')] = surveys

print(table)

j = 2
while True:
    # Stop when the pager has no link for the next page number.
    if len(d.find_elements_by_css_selector(
            "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))) > 0:
        d.find_elements_by_css_selector(
            "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))[0].click()
        tableElement = d.find_element_by_css_selector(
            "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate")
        table1 = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
        table1.columns = table1.iloc[0]
        table1 = table1.iloc[1:]
        table1 = table1[table1.Select == 'SurveyNo']  # assumption: SurveyNo exists for all wanted rows
        surveyNo_scripts = [item.get_attribute('href') for item in d.find_elements_by_css_selector(
            "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
        for i, script in enumerate(surveyNo_scripts):
            d.execute_script(script)
            surveys = d.find_element_by_css_selector('textarea').text
            table1.iloc[[i], table1.columns.get_loc('Select')] = surveys
        table1.columns = table.columns
        table = pd.concat([table, table1], ignore_index=True)
        print(table)
        j += 1
    else:
        break
table.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv", sep=',', encoding='utf-8-sig', index=False)

Tags: python-3.x, selenium-webdriver, beautifulsoup

Solution
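
One likely factor, assuming the grid is a standard ASP.NET GridView: the pager renders only about ten numbered links at a time, and every page change is a full postback that replaces the table, so reading the grid before the postback finishes (or holding references to elements from the previous page) can raise stale-element errors or silently re-read page 10. Below is a minimal sketch of a more defensive paging loop. It reuses `d` and `pd` from the question, and the helper `scrape_current_page` is a placeholder for the per-page SurveyNo logic above, not part of the original code.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

GRID = "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate"

def scrape_current_page(d):
    # Placeholder: parse the currently displayed grid into a DataFrame.
    el = d.find_element_by_css_selector(GRID)
    t = pd.read_html(el.get_attribute('outerHTML'))[0]
    t.columns = t.iloc[0]
    return t.iloc[1:]

frames = [scrape_current_page(d)]
j = 2
while True:
    links = d.find_elements_by_css_selector(
        "{} a[href*='Page${}']".format(GRID, j))
    if not links:
        break  # no link for page j anywhere in the pager, so we are done
    old_grid = d.find_element_by_css_selector(GRID)
    links[0].click()
    # Block until the postback has replaced the grid; reading the table
    # too early re-reads the previous page and corrupts the output.
    WebDriverWait(d, 20).until(EC.staleness_of(old_grid))
    frames.append(scrape_current_page(d))
    j += 1

result = pd.concat(frames, ignore_index=True)
result.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv",
              sep=',', encoding='utf-8-sig', index=False)

Collecting each page into a list and calling pd.concat once at the end also avoids re-copying the accumulated frame on every iteration of the loop.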

