python - 处理非结构化数据——Python pandas
问题描述
我已经使用 pdf、text 和 word 文件创建了以下数据表。我已经使用基于字典的方法将各种标题分类到一个模块中。现在我需要从句子中获取值并结构化数据。
目前这是数据的外观:
header Module
ADDITIONAL BONUS Payroll Pay Element
EMPLOYEE ID: 4564576 Employee ID
AMOUNT: 1200 USD Amount
EMPLOYEE NAME: ANDY CHEN Employee Name
SPOT AWARD Payroll Pay Element
EMPLOYEE ID: 7463453 Employee ID
AMOUNT: 200 USD Amount
EMPLOYEE NAME: MICHAEL HISHAM Employee Name
REALOCATION BONUS Payroll Pay Element
EMPLOYEE ID: 7467673 Employee ID
AMOUNT: 1400 USD Amount
EMPLOYEE NAME: AYMAN HISHAM Employee Name
REALOCATION BONUS Payroll Pay Element
EMPLOYEE ID: 7467673 Employee ID
AMOUNT: 1400 USD Amount
EMPLOYEE NAME: AYMAN HISHAM Employee Name
不确定如何以结构化格式实现所需的输出,如下所示:
Employee ID Employee Name Payroll Pay Element Amount
4564576 ANDY CHEN ADDITIONAL BONUS 1200 USD
7463453 MICHAEL HISHAM SPOT AWARD 200 USD
7467673 AYMAN HISHAM REALOCATION BONUS 1400 USD
7467673 AYMAN HISHAM REALOCATION BONUS 1400 USD
我使用以下代码从各种格式的文档中提取第一个表的数据。
import re,os, subprocess, pandas as pd, numpy as np
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import nltk
####################################PATH TO SAVE FILES###########################################
os.chdir(r"C:\Analytics\Microsoft\One_Time_Payment")
###################Converting CVs from Various format to txt Format ###########
path_PDF = r"\pdf"
path_DOC = r"\word"
path_TXT = r"\txt"
path_ERROR=r"\ERROR"
try:
with subprocess.Popen([os.path.join(path_DOC, "docto.exe"), '-f', path_DOC, '-O', path_TXT,'-T','wdFormatTEXT'], stdout=subprocess.PIPE) as proc:
print(proc.stdout.read())
except Exception:
pass
all_pdf = os.listdir(path_PDF)
for each_file in all_pdf:
print("DEL : ",each_file)
print("Test above")
if each_file.endswith('.pdf'):
# cnd_id = (re.findall('\d+', each_file ))
txt_filename = each_file.partition('.')[0] + ".txt"
subprocess.call([os.path.join(path_PDF, "pdftotext.exe"), '-raw', '-eol', 'dos','-layout',
os.path.join(path_PDF, each_file),
os.path.join(path_TXT, txt_filename)])
#####################Processing texts ###########################################
counter = 1
all_text = os.listdir(path_TXT)
#all_text = [os.path.splitext(filename)[0] for filename in os.listdir(path_TXT)]
# only in case extra file is gettign created
#error_file=pd.DataFrame(index=['sno'],columns=['msg'])
for each_file in all_text:
try:
print(os.path.join(path_TXT, each_file))
inpdata = pd.read_table(os.path.join(path_TXT, each_file), header=None,encoding = "ISO-8859-1",sep='\n')
# cnd_id = (re.findall('\d{6}', each_file ))
except Exception as e1:
try:
inpdata = pd.read_table(os.path.join(path_TXT, each_file), header=None, engine='python',encoding = "ISO-8859-1",sep='\n',error_bad_lines=False)
except Exception as ec:
print("Python Engine E ::>> ",ec)
str_err = "Could not process the file %s" % (each_file)
print(str_err)
continue
# try:
# inpdata = pd.read_table(os.path.join(path_TXT, each_file), header=None,encoding = "ISO-8859-1",sep='\n', error_bad_lines=False)
# except Exception as ec:
# error_file["msg"]=str_err
inpdata.columns = ["Sentences"]
inpdata['Sentences'] = inpdata['Sentences'].str.upper()
# inpdata['Module'] = ''
inpdata_bck = inpdata
inpdata_bck['sent_id'] = range(0,len(inpdata_bck))
# inpdata_bck['cnd_id'] = str(cnd_id)
# inpdata['num_words'] = 0
inpdata['CID'] = each_file.partition('.')[0]
inpdata.dropna(subset=["Sentences"], inplace=True)
inpdata = inpdata.reset_index()
del inpdata['index']
inpdata_bck.to_csv(each_file + '.csv')
if (counter==1):
all_info = inpdata
# err_file=error_file
# err_file1=error_file1
else:
all_info = pd.concat([all_info,inpdata])
# err_file=pd.concat([err_file,error_file])
# err_file1=pd.concat([err_file1,error_file1])
counter = counter + 1
#Removing Special Characteres to avoid import #Name issue
all_info['Sentences']= all_info['Sentences'].map(lambda x: x.lstrip('+-'))
all_info = all_info.reset_index()
del all_info['index']
mod = pd.read_excel(r"Payroll_dict.xlsx", sheetname='Payroll Dictionary')
all_info['num_words'] = 0
all_info['Module'] = ''
all_info = all_info[pd.notnull(all_info['Sentences'])]
all_info = all_info.reset_index()
del all_info['index']
categoryDict = {}
modules = list(set(mod['Module']))
for module in modules:
categoryDict[module] = mod.loc[mod['Module'] == module, 'Header']
#Function for Module Assignment
def module_assign(inpdata):
######################### Module Creation ##############################
curr_sent_value=inpdata['Sentences']
if inpdata['num_words'] <= 0:
list1=["EMPLOYEE ID:", "REFERENCE ID"]
if curr_sent_value in list1:
return 'Employee ID'
list2=["Amount", "AMOUNTS"]
if curr_sent_value in list2:
return 'Amount'
else:
for key in categoryDict.keys():
for word in categoryDict[key]:
match = re.findall(word,curr_sent_value,re.I|re.M)
if len(match)>0:
return key
for CID, cid_data in all_info.groupby('CID'):
inpdata=all_info
inpdata = inpdata.reset_index()
# del inpdata['index']
inpdata['header'] = " "
# inpdata['clean_sent']=inpdata['Sentences'].map(reg_clean)
inpdata['num_words']=inpdata['Sentences'].map(lambda x: len(x.split()))
inpdata['header']=np.where(inpdata['num_words']<=5,inpdata['Sentences'],'')
inpdata = inpdata[inpdata['num_words'] > 0]
inpdata['Module']=inpdata.apply(module_assign,axis=1)
def split_the_sentence_to_words_rem_stopwrds(p_sentence):
return [token for token in word_tokenize(p_sentence)]
inpdata['split_wrd'] =inpdata['Sentences'].map(split_the_sentence_to_words_rem_stopwrds)
解决方案
您可以对数据表进行透视(pivot),然后去掉各列值中的标题前缀:
new_df = df.pivot_table(index=df.Module.eq('Payroll Pay Element').cumsum(),
columns='Module',
values='header',
aggfunc='first')
for col in ['Amount','Employee ID', 'Employee Name']:
new_df[col] = new_df[col].str[len(col)+1:]
print(new_df)
输出:
Module Amount Employee ID Employee Name Payroll Pay Element
Module
1 1200 USD 4564576 ANDY CHEN ADDITIONAL BONUS
2 200 USD 7463453 MICHAEL HISHAM SPOT AWARD
3 1400 USD 7467673 AYMAN HISHAM REALOCATION BONUS
4 1400 USD 7467673 AYMAN HISHAM REALOCATION BONUS
推荐阅读
- c# - 绑定到作为另一个对象的属性的 DataTable:System.ArgumentException 列不属于该表
- java - 如何从 lambda 中获取变量?
- python - 过滤熊猫数据框以限制在给定的日期范围内
- javascript - 如何标签过去关闭的模态链接?#a11y
- ios - 获取 Wifi(SSID) 列表 ios 应用程序并连接到我的应用程序是可能的吗?
- linux - 未投递的邮件退回给发件人 - 不同的域目的地
- google-cloud-spanner - CloudSpanner 是否支持模糊搜索或通配符搜索?
- google-cloud-platform - 围绕创建 VPC 访问连接器的困惑
- select - Azure AD Graph 组成员 oData 结果限制
- php - 此 Convert JSON in Array 返回为 null