python - 对python列表中缺失信息进行排序的算法
问题描述
我正在读取 PDF 文件,并使用 pdf2image 将所有页面转换为图像;随后使用 OpenCV 从图像中提取特定表格。我使用的是通用的图像处理技术,例如将图像转换为灰度图并对其进行一些形态学操作以获取表格。
我面临的问题是我的代码无法读取表格的所有单元格,有时它会丢失数据,
#sort all the contours from top to bottom
(contours, boundingBoxes) = sort_contours(contours,method="top-to-bottom")
#crop all the boxes
idx = 0
for c in contours:
x,y,w,h = cv2.boundingRect(c)
if cv2.contourArea(c)<300000: #(w > 80 and h > 20) and w > 3*h:
idx += 1
new_img = img[y:y+h, x:x+w]
r = 1600/new_img.shape[1]
dim = (1600,int(new_img.shape[0]*r))
new_img = cv2.resize(new_img,dim)
cv2.imwrite(str(idx) + '.png',new_img)
#this function reads and returns text inside the cropped cell
val = highlight_text(new_img)
text.append(val)
text = list(filter(lambda x: x != " ", text))
os.chdir("..\\images_from_pdf")
return text
现在这个函数以从上到下的列表形式返回每个单元格中提取文本的值,即
['**particulars**', 'zohaib ali', '**Name**', '1234', '**identity**']
在此列表中,第一个值并不重要,因此我将其删除。之后,我对列表进行排序,以便交换两个值,即
['**Name**','zohaib ali','**identity**','1234']
现在它完全符合我需要将其存储在字典或 json 中的格式,但如果它遗漏了任何一个值,整个顺序就会混乱,我需要根据遗漏值的位置再次对内容进行排序。
预期结果:
现在我想以一种可以识别表中缺失值的方式对列表进行排序,但我无法这样做,所以我编写了另一个函数对其进行排序但无济于事,请帮忙
代表
import json
import re
def interchange_val(val):
    """Swap each adjacent pair of entries of *val* in place and return *val*.

    Entries 0 and 1 trade places, then 2 and 3, and so on.  When the length
    is odd the final entry has no partner and keeps its position.
    """
    # Walk the odd positions; each one is exchanged with its left neighbour.
    for right in range(1, len(val), 2):
        left = right - 1
        val[left], val[right] = val[right], val[left]
    return val
def make_dict(val_list: list, form_keys: list) -> list:
    """Collect the values that accompany the form keys found in *val_list*.

    Despite its name this function returns a list, not a dict.

    The work is done in two passes:

    1. Each entry of *val_list* is flagged ``True`` when it contains the
       first word of any entry of *form_keys* as a whole word
       (case-insensitive), otherwise ``False``.  For a key/value layout such
       as ``[Name, value, Identity, value, Amount, value]`` the flags are
       ``[True, False, True, False, True, False]``.
    2. The flags drive the assembly of the output:

       * the first entry is not a key: ``""`` is emitted;
       * the first entry is a key and the second is not: the second entry
         is emitted;
       * from index 2 on, a non-key that directly follows a key is emitted;
       * from index 2 on, a key directly followed by another key emits
         ``""`` as a placeholder for its missing value.

    NOTE(review): the assembly rules are kept as originally written.  A key
    sitting at index 1 or at the final position produces no output, so
    readings with missing cells are not reliably reconstructed -- confirm
    the intended pairing before relying on the placeholders.

    Args:
        val_list: Strings read from the form.
        form_keys: Labels expected on the form.  Only the first
            whitespace-separated word of each label is used for matching;
            blank labels are ignored.

    Returns:
        The emitted values and ``""`` placeholders, in the order they were
        encountered.  Empty when *val_list* is empty or no usable key exists.
    """
    # Escape the key so labels containing regex metacharacters cannot break
    # (or change the meaning of) the pattern; compile once, outside the scan.
    key_patterns = [
        re.compile(rf"\b{re.escape(words[0])}\b", flags=re.IGNORECASE)
        for words in (key.split() for key in form_keys)
        if words
    ]
    if not val_list or not key_patterns:
        return []

    # One flag per entry, so trailing values after the last key are kept.
    is_key = [
        any(pattern.search(entry) for pattern in key_patterns)
        for entry in val_list
    ]

    collected = []
    last = len(is_key) - 1

    # Rules for the first entry (the look-ahead is guarded for 1-item input).
    if not is_key[0]:
        collected.append("")
    elif last >= 1 and not is_key[1]:
        collected.append(val_list[1])

    # Index 1 is only ever consumed through the first-entry rule above.
    for idx in range(2, len(is_key)):
        if not is_key[idx]:
            if is_key[idx - 1]:
                collected.append(val_list[idx])
        elif idx < last and is_key[idx + 1]:
            collected.append("")
    return collected
def main_func():
    """Run make_dict on the well-formed sample reading and print the result."""
    # Keys that must be present in the form but might be missed while reading.
    form_keys = ["Name", "Identity", "Amount"]

    # Sample reading with every item present, in the correct order.
    val_list = ["particulars", "Zohaib Ali", "Name", "1234", "Identity", "24", "Amount"]
    # Sample reading where the value of the Name field was missed (unused here).
    val_list_1 = ["particulars", "Name", "1234", "Identity", "24", "Amount"]
    # Sample reading where the Identity entry was missed (unused here).
    val_list_2 = ["particulars", "Zohaib", "Name", "1234", "24", "Amount"]

    # The leading "particulars" header carries no data, so drop it.
    if val_list[0] == "particulars":
        del val_list[0]

    # interchange_val reorders val_list in place; the same list is used below.
    interchange_val(val_list)
    result = make_dict(val_list, form_keys)
    print(result)


if __name__ == "__main__":
    main_func()
解决方案
推荐阅读
- ios - 如何传递给 SeconViewController 作为列表必须在 FirstViewController 中显示广泛的文本列表
- opengl - 为什么应用 PBR 定向光时不渲染阴影?
- mysql - mySQL中的简单乘积计算
- c# - 将字符串中出现的枚举值替换为枚举值的索引以进行字符串比较
- r - 在 R 中使用带有分组条形图和 facet_wrap 的 ggsignif 时如何定义数据
- typescript - 即使使用 AdministratorAccess 运行 CDK,S3 访问也被拒绝
- c# - 如何在 C# 中将文本文件中的单词添加到数组列表中
- javascript - 在 React JS 状态下更新输入字段值
- sql - 如何以正确的方式使用模型 - 用 typescript 续集
- javascript - 将搜索结果传递给反应中的另一个页面/组件