python - 查找具有相同值的字典并向它们添加新键
问题描述
我有这样一个问题。我有一个字典列表，其中包含使用 Google API 从图像中识别出的单词信息。我的列表如下：
test_list = [
{
"value": "68004,",
"location": {
"TL": {
"x": 351,
"y": 0
},
"TR": {
"x": 402,
"y": 0
},
"BR": {
"x": 402,
"y": 12
},
"BL": {
"x": 351,
"y": 12
}
},
"type": 1
},
{
"value": "Чорномор",
"location": {
"TL": {
"x": 415,
"y": 0
},
"TR": {
"x": 493,
"y": 0
},
"BR": {
"x": 493,
"y": 12
},
"BL": {
"x": 415,
"y": 12
}
},
"type": 1
},
{
"value": "вулиця,",
"location": {
"TL": {
"x": 495,
"y": 14
},
"TR": {
"x": 550,
"y": 10
},
"BR": {
"x": 551,
"y": 22
},
"BL": {
"x": 496,
"y": 26
}
},
"type": 1
},
{
"value": "140,",
"location": {
"TL": {
"x": 557,
"y": 8
},
"TR": {
"x": 576,
"y": 7
},
"BR": {
"x": 577,
"y": 20
},
"BL": {
"x": 558,
"y": 21
}
},
"type": 1
},
{
"value": "кв.",
"location": {
"TL": {
"x": 581,
"y": 6
},
"TR": {
"x": 605,
"y": 4
},
"BR": {
"x": 606,
"y": 21
},
"BL": {
"x": 582,
"y": 23
}
},
"type": 1
},
{
"value": "77",
"location": {
"TL": {
"x": 607,
"y": 5
},
"TR": {
"x": 628,
"y": 4
},
"BR": {
"x": 629,
"y": 19
},
"BL": {
"x": 608,
"y": 21
}
},
"type": 1
},
]
因此，我想检查是否有某些字典具有相同的位置参数；如果有，就给这些字典添加一个取值相同的新键“string_number”。以上面的代码为例，前两个字典的 ["location"]["TL"]["y"] 和 ["location"]["TR"]["y"] 都等于 0 像素，["location"]["BR"]["y"] 和 ["location"]["BL"]["y"] 都等于 12 像素。这意味着这些单词在真实文档中位于同一行，所以我想给它们添加值为 0 的新键“string_number”。结果应如下所示：
test_list = [
{
"value": "68004,",
"location": {
"TL": {
"x": 351,
"y": 0
},
"TR": {
"x": 402,
"y": 0
},
"BR": {
"x": 402,
"y": 12
},
"BL": {
"x": 351,
"y": 12
}
},
"type": 1,
"string_number": 0
},
{
"value": "Чорномор",
"location": {
"TL": {
"x": 415,
"y": 0
},
"TR": {
"x": 493,
"y": 0
},
"BR": {
"x": 493,
"y": 12
},
"BL": {
"x": 415,
"y": 12
}
},
"type": 1,
"string_number": 0
},
然后遍历列表的其余部分，找出每一组这样的重复项，并为它们设置相同的行号。不过，有时像素可能相差 1-2 个点甚至更多（例如“y”为 12、10 或 14 时，仍可能表示该单词位于文档的同一行）。那么，能否针对这种偏差增加额外的容差检查？
编辑：我借助 Aleksa Svitlica 的回答创建了一个类，用它完成查找位于同一行的单词的全部工作。代码如下：
class WordParser():
    """Tag OCR word boxes that lie on the same text line.

    Each entry of ``self.list_wbw`` is a dict that must provide
    ``["location"][corner]["y"]`` for the corners ``TL``, ``TR``, ``BL`` and
    ``BR``. Entries judged to be on the same line receive the same integer
    under the ``"string_number"`` key.
    """

    def __init__(self):
        self.list_wbw = self.load_json()
        # Next unused line index; handed out when a pair of still-untagged
        # words is found to share a line.
        self.next_new_string_number = 0

    def load_json(self):
        """Load the word list from disk, store it on the instance, return it.

        The file path comes from ``calc_paths`` / ``PathType``, which are
        defined outside this class.
        """
        with io.open(calc_paths(status="now", path_type=PathType.OCR_JSON_WBW), 'r', encoding='utf-8') as content:
            # The ``with`` block closes the file; no explicit close() needed.
            self.list_wbw = json.load(content)
        return self.list_wbw

    def mark_images_on_same_line(self):
        """Compare every unordered pair of words and tag those sharing a line."""
        words = self.list_wbw
        for i, image1 in enumerate(words):
            for image2 in words[i + 1:]:
                if self._check_if_images_on_same_line(image1, image2):
                    self._add_string_number_to_images(image1, image2)

    def print_images(self):
        """Print the (possibly tagged) word list as indented JSON."""
        print(json.dumps(self.list_wbw, indent=3, sort_keys=False, ensure_ascii=False))

    def _check_if_images_on_same_line(self, image1, image2):
        """Return True when all four corner ``y`` values of both boxes match.

        "Match" means within the default tolerance of
        ``_pixel_heights_match_within_threshold``. This is a pure predicate:
        it does not modify either dict.
        """
        location1 = image1["location"]
        location2 = image2["location"]
        return all(
            self._pixel_heights_match_within_threshold(
                location1[corner]["y"], location2[corner]["y"]
            )
            for corner in ("TL", "TR", "BL", "BR")
        )

    def _add_string_number_to_images(self, image1, image2):
        """Write one shared ``"string_number"`` into both dicts."""
        string_number = self._determine_string_number(image1, image2)
        image1["string_number"] = string_number
        image2["string_number"] = string_number

    def _determine_string_number(self, image1, image2):
        """Pick the line index a matching pair should share.

        An existing number on ``image1`` wins, then one on ``image2``;
        otherwise a fresh number is allocated and the counter advances.
        NOTE: if both already carry different numbers, ``image1``'s is
        returned and other members of ``image2``'s old group are not updated.
        """
        image1_number = image1.get("string_number")
        if image1_number is not None:
            return image1_number
        image2_number = image2.get("string_number")
        if image2_number is not None:
            return image2_number
        string_number = self.next_new_string_number
        self.next_new_string_number += 1
        return string_number

    def _pixel_heights_match_within_threshold(self, height1, height2, threshold=4):
        """Return True when the two heights differ by at most ``threshold``."""
        return abs(height1 - height2) <= threshold
在我的另一个模块中,我调用这些方法:
# Build the parser, tag the words that share a line, then print the result.
word_parser = WordParser()
word_parser.mark_images_on_same_line()
word_parser.print_images()
解决方案
在 test_list 之后添加以下代码，我得到了下面所示的输出。我的代码目前只检查 TL 和 BL 的高度（y 坐标）是否在阈值内（默认阈值为 2 像素），但您可以根据自己的需求进行修改：在 _check_if_images_on_same_line 中按需更改规则即可。
import json
#-------------------------------------------------------------------
#---Classes---------------------------------------------------------
#-------------------------------------------------------------------
class ImageParser():
    """Tag OCR word boxes that lie on the same text line.

    Each entry of ``list_of_images`` is a dict that must provide
    ``["location"]["TL"]["y"]`` and ``["location"]["BL"]["y"]``. Entries
    judged to be on the same line receive the same integer under the
    ``"string_number"`` key; the dicts are modified in place.
    """

    def __init__(self, list_of_images):
        self.list_of_images = list_of_images
        # Next unused line index; handed out when a pair of still-untagged
        # images is found to share a line.
        self.next_new_string_number = 0

    # ----------------------------------------------------------------------------
    # ---Public-------------------------------------------------------------------
    # ----------------------------------------------------------------------------
    def mark_images_on_same_line(self):
        """Compare every unordered pair of images and tag those sharing a line."""
        images = self.list_of_images
        for i, image1 in enumerate(images):
            for image2 in images[i + 1:]:
                if self._check_if_images_on_same_line(image1, image2):
                    self._add_string_number_to_images(image1, image2)

    def print_images(self):
        """Print the (possibly tagged) image list as indented JSON."""
        # indent=1 produces the same output as indent=True (True == 1) but
        # states the intent explicitly.
        print(json.dumps(self.list_of_images, indent=1, sort_keys=False, ensure_ascii=False))

    # ----------------------------------------------------------------------------
    # ---Private------------------------------------------------------------------
    # ----------------------------------------------------------------------------
    def _check_if_images_on_same_line(self, image1, image2):
        """Return True when the left edges of both boxes line up vertically.

        Compares the ``TL`` and ``BL`` ``y`` values of the two images using
        the default tolerance of ``_pixel_heights_match_within_threshold``.
        This is a pure predicate: it does not modify either dict.
        """
        location1 = image1["location"]
        location2 = image2["location"]
        same_top_position = self._pixel_heights_match_within_threshold(
            location1["TL"]["y"], location2["TL"]["y"]
        )
        same_bot_position = self._pixel_heights_match_within_threshold(
            location1["BL"]["y"], location2["BL"]["y"]
        )
        return same_top_position and same_bot_position

    def _add_string_number_to_images(self, image1, image2):
        """Write one shared ``"string_number"`` into both dicts."""
        string_number = self._determine_string_number(image1, image2)
        image1["string_number"] = string_number
        image2["string_number"] = string_number

    def _determine_string_number(self, image1, image2):
        """Pick the line index a matching pair should share.

        An existing number on ``image1`` wins, then one on ``image2``;
        otherwise a fresh number is allocated and the counter advances.
        NOTE: if both already carry different numbers, ``image1``'s is
        returned and other members of ``image2``'s old group are not updated.
        """
        image1_number = image1.get("string_number")
        if image1_number is not None:
            return image1_number
        image2_number = image2.get("string_number")
        if image2_number is not None:
            return image2_number
        string_number = self.next_new_string_number
        self.next_new_string_number += 1
        return string_number

    def _pixel_heights_match_within_threshold(self, height1, height2, threshold=2):
        """Return True when the two heights differ by at most ``threshold``."""
        return abs(height1 - height2) <= threshold
#-------------------------------------------------------------------
#---Main------------------------------------------------------------
#-------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: tag the sample words by line, then print them.
    parser = ImageParser(test_list)
    parser.mark_images_on_same_line()
    parser.print_images()
给出以下结果:
[
{
"value": "68004,",
"location": {
"TL": {
"x": 351,
"y": 0
},
"TR": {
"x": 402,
"y": 0
},
"BR": {
"x": 402,
"y": 12
},
"BL": {
"x": 351,
"y": 12
}
},
"type": 1,
"string_number": 0
},
{
"value": "Чорномор",
"location": {
"TL": {
"x": 415,
"y": 0
},
"TR": {
"x": 493,
"y": 0
},
"BR": {
"x": 493,
"y": 12
},
"BL": {
"x": 415,
"y": 12
}
},
"type": 1,
"string_number": 0
},
{
"value": "вулиця,",
"location": {
"TL": {
"x": 495,
"y": 14
},
"TR": {
"x": 550,
"y": 10
},
"BR": {
"x": 551,
"y": 22
},
"BL": {
"x": 496,
"y": 26
}
},
"type": 1
},
{
"value": "140,",
"location": {
"TL": {
"x": 557,
"y": 8
},
"TR": {
"x": 576,
"y": 7
},
"BR": {
"x": 577,
"y": 20
},
"BL": {
"x": 558,
"y": 21
}
},
"type": 1,
"string_number": 1
},
{
"value": "кв.",
"location": {
"TL": {
"x": 581,
"y": 6
},
"TR": {
"x": 605,
"y": 4
},
"BR": {
"x": 606,
"y": 21
},
"BL": {
"x": 582,
"y": 23
}
},
"type": 1,
"string_number": 1
},
{
"value": "77",
"location": {
"TL": {
"x": 607,
"y": 5
},
"TR": {
"x": 628,
"y": 4
},
"BR": {
"x": 629,
"y": 19
},
"BL": {
"x": 608,
"y": 21
}
},
"type": 1,
"string_number": 1
}
]
推荐阅读
- python - 返回 beta 系数 B0 和 B1 的函数
- visual-studio - 如何在 Visual Studio 2019 中覆盖 Vue.js 客户端脚本的源映射,以便调试器在断点处停止?
- winapi - 如何将 IMFSample 从 Windows Duplication API 编码为 H264?
- arduino - XMLHTTP --- ERR_CONNECTION_RESET
- javascript - 在 React 中显示单击元素的详细信息
- javascript - Jquery函数检查图像的SRCset是否完成加载
- asp.net-core - Net Core 3.1 中的 DpapiDataProtectionProvider 替代方案
- c# - Visual Studio 社区上的不兼容目标框架
- php - Laravel 6.10 -> 路由 [admin.dashboard] 未定义。但其他路线正在运行
- operating-system - 为什么虚拟到物理内存地址转换需要专门的硬件而不是文件路径解码?