首页 > 解决方案 > Convert multiple HTML to CSV file quickly in Python

问题描述

I need to extract data from multiple HTML files and convert them into a single CSV file. As the HTML pages are completely unstructured, the task became tedious. Once I finished the script, test runs went well for 1, 2 ... 10 files, but beyond that it starts taking a long time. For 100+ files it nearly collapses. I tried 340 files: it worked, but took at least 3 hours and the machine hung at the end. I am pasting the full code below, along with an example HTML file (source code) as an attachment. Is there a better way to process these files? Note: I have already checked this answer and it doesn't help much. Thanks.

import os
from bs4 import BeautifulSoup as bs
import pandas as pd
import glob
import datetime

root_dir = r'/home/some path'

# (output column, lower-case phrase searched for in the act name)
ACT_KEYWORDS = (
    ('IPC', 'indian penal code'),
    ('PoA', 'prevention of atrocities'),
    ('PCSO', 'protection of children from sexual'),
    ('PCR', 'protection of civil rights'),
)


def _detach(node):
    """Return *node* as a plain ``str``; ``None`` is passed through.

    Beautiful Soup's documentation warns that a NavigableString kept
    outside the tree still references the whole parse tree.  Storing such
    objects in ``all_list`` therefore keeps every parsed file in memory,
    which is a likely cause of the slowdown on large batches.
    """
    return None if node is None else str(node)


# One dict per HTML file; turned into a DataFrame after the loop.
all_list = []
for newFile in glob.glob(os.path.join(root_dir, '**/*.html'), recursive=True):
    dictionary = {}
    # Parse the file and close the handle immediately (the handle used to
    # be left open for every file processed).
    with open(newFile) as openFile:
        soup = bs(openFile, 'html.parser')

    # Every section below is best effort: the pages are unstructured, so a
    # missing element simply leaves the corresponding columns empty.
    # ``except Exception`` (rather than a bare ``except``) keeps that
    # behaviour while still letting Ctrl-C / SystemExit stop a long run.

    # section 1: Case Details
    # Values are reached by a fixed number of ``.next`` hops from the
    # 'case_details_table' span; the dict is only filled once all of the
    # hops succeeded.
    try:
        caseType = soup.find('span', {'class': 'case_details_table'})
        caseTypeChild = caseType.findChild()
        # ref for .next - https://stackoverflow.com/questions/5999407/extract-content-within-a-tag-with-beautifulsoup
        sessionsCase = caseTypeChild.next.next.next
        filing = sessionsCase.next.next
        filingNumberHeading = filing.find('label')
        filingNumber = filingNumberHeading.next.next
        filingDate = filingNumber.next.next.next.next
        registration = filingDate.next.next
        registrationNumberHeading = registration.find('label')
        registrationNumber = registrationNumberHeading.next.next.next
        cnrHeading = soup.find('b').find('label')
        cnrNumber = cnrHeading.next.next
        dictionary['Filing Number'] = _detach(filingNumber)
        dictionary['Filing Date'] = _detach(filingDate)
        dictionary['Registration Number'] = _detach(registrationNumber)
        dictionary['CNR Number'] = _detach(cnrNumber)
    except Exception:
        pass

    # section 2: Case Status
    # Each value is the sibling following a <strong> heading; values found
    # before a failure are kept.
    try:
        firstHearing = soup.find('strong')
        firstHearingDate = firstHearing.next_sibling.text
        dictionary['First Hearing'] = _detach(firstHearingDate)
        nextHearing = soup.find('strong', text='Next Hearing Date')
        nextHearingDate = nextHearing.next_sibling.text
        dictionary['Next Hearing'] = _detach(nextHearingDate)
        stageOfCase = soup.find('strong', text='Stage of Case')
        stageOfCaseText = stageOfCase.next_sibling.text
        dictionary['Stage of Case'] = _detach(stageOfCaseText)
        courtNumber = soup.find('strong', text='Court Number and Judge')
        courtNumberText = courtNumber.next_sibling.next_sibling.text.strip()
        dictionary['Court Number and Judge'] = _detach(courtNumberText)
    except Exception:
        pass

    # section 6: FIR Details
    # Here the column names themselves are read from the page headings.
    try:
        policeStationHeading = soup.find('span', attrs={'class': 'FIR_details_table'}).next.next
        policeStation = policeStationHeading.next.next.next.next
        firnumberHeading = policeStation.next.next.next
        firNumber = policeStation.find_next('label').next
        firYearHeading = firNumber.next.next.next
        firYear = firNumber.find_next('span').find_next('label').next
        # same as previous sections.
        dictionary[_detach(policeStationHeading)] = _detach(policeStation)
        dictionary[_detach(firnumberHeading)] = _detach(firNumber)
        dictionary[_detach(firYearHeading)] = _detach(firYear)
    except Exception:
        pass

    # section 3: Petitioner and Advocate
    try:
        petitioner = soup.find('span', attrs={'class': 'Petitioner_Advocate_table'})
        petitionerName = petitioner.next
        dictionary['Name of the Petitioner'] = _detach(petitionerName)
        petitionerAdvocate = petitionerName.next.next
        dictionary['Name of the Advocate'] = _detach(petitionerAdvocate)
        # section 4: Respondent and Advocate
        respondent = petitionerAdvocate.find_next('span')
        respondentName = respondent.next
        dictionary['Name of the Respondent'] = _detach(respondentName)
    except Exception:
        pass

    # section 5: Acts
    # 1. Act names and their sections are read from the first and second
    #    cells of the rows in #act_table.
    # 2. Every act column starts out as 'Not Applied'.
    # 3. Each act name is compared with the known phrases and, on the
    #    first match, that act's sections replace the default value.
    acts = soup.select('#act_table td:nth-of-type(1)')
    sections = soup.select('#act_table td:nth-of-type(2)')
    dictionary['IPC'] = 'Not Applied'
    dictionary['PoA'] = 'Not Applied'
    dictionary['PCSO'] = 'Not Applied'
    dictionary['PCR'] = 'Not Applied'
    dictionary['Any Other Act'] = 'Not Applied'

    # Every (act, sections) pair of *this* file is checked against all four
    # phrases.  The earlier hand-unrolled ladder skipped the third act,
    # ignored files with four or more acts, and could reuse variables left
    # over from a previous file (or hit a NameError) when the table was
    # missing.
    for act, section in zip(acts, sections):
        act_text = str(tuple(act.contents)).lower()
        for column, keyword in ACT_KEYWORDS:
            if keyword in act_text:
                # Stored as the text the tuple of cell contents would be
                # written as in the CSV, detached from the parse tree.
                dictionary[column] = str(tuple(section.contents))
                break
    all_list.append(dictionary)

# Build one table from the per-file dicts and write it out as CSV.
df = pd.DataFrame(all_list)
# reindex() fixes the column order and, unlike df[[...]], creates an empty
# column instead of raising KeyError when a field was never extracted.
df = df.reindex(columns=['CNR Number', 'Filing Number', 'Filing Date', 'First Hearing', 'Next Hearing', 'Stage of Case', 'Registration Number', 'Year', 'FIR Number', 'Police Station', 'Court Number and Judge',  'PoA', 'IPC', 'PCR', 'PCSO', 'Any Other Act', 'Name of the Petitioner', 'Name of the Advocate', 'Name of the Respondent'])
# Take the timestamp once so day/month/year always belong to the same date.
today = datetime.datetime.now()
# File name pattern: file<day>_<month>_<year>.csv (the original path
# literal was unterminated, which made the script a syntax error).
outputPath = os.path.join('/home/some path name/',
                          f'file{today.day}_{today.month}_{today.year}.csv')
# The context manager closes the file even if to_csv() fails; newline=''
# lets the CSV writer control line endings itself.
with open(outputPath, 'w', newline='') as outputFile:
    df.to_csv(outputFile)

标签: pythonpandascsv

解决方案


The all_list list accumulates the data from all files in memory. I recommend writing one row per extracted case instead, as follows:

import csv

# Fixed column order shared by every row of the output file.
columns = ['CNR Number', 'Filing Number', 'Filing Date', 'First Hearing', 'Next Hearing', 'Stage of Case', 'Registration Number', 'Year', 'FIR Number', 'Police Station', 'Court Number and Judge',  'PoA', 'IPC', 'PCR', 'PCSO', 'Any Other Act', 'Name of the Petitioner', 'Name of the Advocate', 'Name of the Respondent']

# Append one row for the case held in `dictionary` (the per-file dict).
# A dict cannot be indexed with a list of keys, so the values are looked
# up per column by DictWriter: missing fields become '' (restval) and keys
# outside `columns` are skipped (extrasaction='ignore').  The csv module
# also quotes values containing commas or newlines, which plain string
# concatenation does not.
with open('output.csv', 'a', newline='') as outputFile:
    writer = csv.DictWriter(outputFile, fieldnames=columns, restval='', extrasaction='ignore')
    writer.writerow(dictionary)

The dict keys are fixed, so each case can be written as a separate row, and the comma-separated file can be appended to after each HTML file is processed.


推荐阅读