python - Convert multiple HTML files to a single CSV file quickly in Python
问题描述
I need to extract data from multiple HTML files and convert them into a single CSV file. As the HTML pages are completely unstructured, the task became tedious. Once I finished the script, test runs went well for 1, 2 ... 10 files, but beyond that it starts taking a long time. For 100+ files it nearly collapses. I tried 340 files: it worked, but it took at least 3 hours and the machine hung at the end. I am pasting the full code below, with an example HTML file (source code) attached. Is there a better way to process the files? Note: I have already checked this answer and it doesn't help much. Thanks.
import os
from bs4 import BeautifulSoup as bs
import pandas as pd
import glob
import datetime
root_dir = r'/home/some path'
all_list = []

# Lower-case substrings that identify an act, paired with the output column
# that receives the sections applied under that act.
ACT_COLUMNS = (
    ('indian penal code', 'IPC'),
    ('prevention of atrocities', 'PoA'),
    ('protection of children from sexual', 'PCSO'),
    ('protection of civil rights', 'PCR'),
)


def as_text(node):
    """Return a plain ``str`` copy of a parsed node; ``None`` stays ``None``.

    BeautifulSoup strings and tags keep a reference to the parse tree they
    came from. Storing them directly in the result rows keeps every parsed
    document alive for the whole run, so rows only ever hold plain strings.
    """
    return None if node is None else str(node)


def classify_acts(act_texts, section_values):
    """Map each recognised act to the sections listed next to it.

    act_texts      -- text of the first cell of every act-table row
    section_values -- value to store for the matching row (second cell)

    Returns a dict with the keys 'IPC', 'PoA', 'PCSO', 'PCR' and
    'Any Other Act'. Every key starts as 'Not Applied'; a recognised act
    replaces its column with that row's sections. Rows whose act matches
    none of ACT_COLUMNS are ignored, so 'Any Other Act' is never filled.
    """
    row = {
        'IPC': 'Not Applied',
        'PoA': 'Not Applied',
        'PCSO': 'Not Applied',
        'PCR': 'Not Applied',
        'Any Other Act': 'Not Applied',
    }
    for act_text, section_value in zip(act_texts, section_values):
        lowered = act_text.lower()
        for needle, column in ACT_COLUMNS:
            if needle in lowered:
                row[column] = section_value
                break
    return row


for newFile in glob.glob(os.path.join(root_dir, '**/*.html'), recursive=True):
    dictionary = {}

    # Parse inside ``with`` so the file handle is closed as soon as the
    # document has been read, instead of leaking one handle per file.
    with open(newFile) as html_file:
        soup = bs(html_file, 'html.parser')

    # Each section below is best-effort: the pages are unstructured, so a
    # missing element (a ``None`` somewhere in the navigation chain) simply
    # leaves that section's remaining columns empty for this file.

    # section 1: Case Details
    try:
        caseType = soup.find('span', {'class': 'case_details_table'})
        caseTypeChild = caseType.findChild()
        # ref for .next - https://stackoverflow.com/questions/5999407/extract-content-within-a-tag-with-beautifulsoup
        sessionsCase = caseTypeChild.next.next.next
        filing = sessionsCase.next.next
        filingNumberHeading = filing.find('label')
        filingNumber = filingNumberHeading.next.next
        filingDate = filingNumber.next.next.next.next
        registration = filingDate.next.next
        registrationNumberHeading = registration.find('label')
        registrationNumber = registrationNumberHeading.next.next.next
        cnrHeading = soup.find('b').find('label')
        cnrNumber = cnrHeading.next.next
        dictionary['Filing Number'] = as_text(filingNumber)
        dictionary['Filing Date'] = as_text(filingDate)
        dictionary['Registration Number'] = as_text(registrationNumber)
        dictionary['CNR Number'] = as_text(cnrNumber)
    except (AttributeError, TypeError):
        pass

    # section 2: Case Status
    try:
        firstHearing = soup.find('strong')
        dictionary['First Hearing'] = as_text(firstHearing.next_sibling.text)
        nextHearing = soup.find('strong', text='Next Hearing Date')
        dictionary['Next Hearing'] = as_text(nextHearing.next_sibling.text)
        stageOfCase = soup.find('strong', text='Stage of Case')
        dictionary['Stage of Case'] = as_text(stageOfCase.next_sibling.text)
        courtNumber = soup.find('strong', text='Court Number and Judge')
        dictionary['Court Number and Judge'] = as_text(
            courtNumber.next_sibling.next_sibling.text.strip())
    except (AttributeError, TypeError):
        pass

    # section 6: FIR Details
    try:
        policeStationHeading = soup.find('span', attrs={'class': 'FIR_details_table'}).next.next
        policeStation = policeStationHeading.next.next.next.next
        firnumberHeading = policeStation.next.next.next
        firNumber = policeStation.find_next('label').next
        firYearHeading = firNumber.next.next.next
        firYear = firNumber.find_next('span').find_next('label').next
        # Here the column names themselves are taken from the page headings.
        dictionary[as_text(policeStationHeading)] = as_text(policeStation)
        dictionary[as_text(firnumberHeading)] = as_text(firNumber)
        dictionary[as_text(firYearHeading)] = as_text(firYear)
    except (AttributeError, TypeError):
        pass

    # section 3: Petitioner and Advocate
    try:
        petitioner = soup.find('span', attrs={'class': 'Petitioner_Advocate_table'})
        petitionerName = petitioner.next
        dictionary['Name of the Petitioner'] = as_text(petitionerName)
        petitionerAdvocate = petitionerName.next.next
        dictionary['Name of the Advocate'] = as_text(petitionerAdvocate)
        # section 4: Respondent and Advocate
        respondent = petitionerAdvocate.find_next('span')
        respondentName = respondent.next
        dictionary['Name of the Respondent'] = as_text(respondentName)
    except (AttributeError, TypeError):
        pass

    # section 5: Acts
    # The first column of #act_table names the act and the second lists the
    # sections applied. Every row is classified from this file's own cells,
    # so nothing carries over from the previously processed file and rows
    # beyond the second are no longer dropped.
    acts = soup.select('#act_table td:nth-of-type(1)')
    sections = soup.select('#act_table td:nth-of-type(2)')
    dictionary.update(classify_acts(
        [cell.get_text() for cell in acts],
        [tuple(as_text(part) for part in cell.contents) for cell in sections],
    ))

    all_list.append(dictionary)
# Build one table from every extracted row.
df = pd.DataFrame(all_list)
# reindex() fixes the column order and, unlike df[[...]], creates an empty
# column instead of raising KeyError when no file supplied that field.
df = df.reindex(columns=['CNR Number', 'Filing Number', 'Filing Date', 'First Hearing', 'Next Hearing', 'Stage of Case', 'Registration Number', 'Year', 'FIR Number', 'Police Station', 'Court Number and Judge', 'PoA', 'IPC', 'PCR', 'PCSO', 'Any Other Act', 'Name of the Petitioner', 'Name of the Advocate', 'Name of the Respondent'])
# Read the date once so day, month and year cannot disagree across midnight.
today = datetime.date.today()
output_path = os.path.join(
    '/home/some path name',
    'file' + str(today.day) + '_' + str(today.month) + '_' + str(today.year) + '.csv')
# newline='' lets the CSV writer control line endings; ``with`` closes the
# file even if writing fails.
with open(output_path, 'w', newline='') as outputFile:
    df.to_csv(outputFile)
解决方案
The all_list list accumulates the data from all files in memory. I recommend writing one row per processed file, as soon as its data has been extracted, as follows:
import csv

# Fixed column order for every row. A field the page did not provide is
# written as an empty cell.
columns = ['CNR Number', 'Filing Number', 'Filing Date', 'First Hearing', 'Next Hearing', 'Stage of Case', 'Registration Number', 'Year', 'FIR Number', 'Police Station', 'Court Number and Judge', 'PoA', 'IPC', 'PCR', 'PCSO', 'Any Other Act', 'Name of the Petitioner', 'Name of the Advocate', 'Name of the Respondent']
# Append this file's row. csv.writer quotes values that contain commas or
# newlines, which joining on ',' by hand would not.
with open('output.csv', 'a', newline='') as outputFile:
    csv.writer(outputFile).writerow([dictionary.get(column, '') for column in columns])
The dict keys are fixed, so each case can be written as a separate row, and the comma-separated file can be appended to after each processed HTML file.
推荐阅读
- r - 在 stat_summary 中设置位置时出错
- python - Django 从 URL 获取文件并使用 S3 将其保存到模型 FileField
- json - 是否可以“注入”对 JSON 模式的引用
- flutter - DioErrorType.RESPONSE:Http 状态错误 [500] (Flutter)
- oracle - 在 oracle 触发器中执行过程期间出现“未找到数据”错误
- c - C如何转换为十六进制整数
- python - 使用多处理、池和读取文件的 Control-C 处理
- c++ - 带红黑树的自由列表分配器
- firebase - 如何将图像上传到 web (firestore) - Flutter web
- python - 蟒蛇井字游戏