首页 > 解决方案 > 查找具有相同值的字典并向它们添加新键

问题描述

我有这样一个问题。我有一个字典列表，其中包含使用 Google API 从图像中识别出的单词的信息。我的列表如下所示：

# Sample input: one dict per recognized word. "value" holds the word text and
# "location" holds the x/y pixel coordinates of the four bounding-box corners
# (TL, TR, BR, BL). The meaning of "type" is not shown here -- TODO confirm.
test_list = [
   {
      "value": "68004,",
      "location": {
         "TL": {
            "x": 351,
            "y": 0
         },
         "TR": {
            "x": 402,
            "y": 0
         },
         "BR": {
            "x": 402,
            "y": 12
         },
         "BL": {
            "x": 351,
            "y": 12
         }
      },
      "type": 1
   },
   {
      "value": "Чорномор",
      "location": {
         "TL": {
            "x": 415,
            "y": 0
         },
         "TR": {
            "x": 493,
            "y": 0
         },
         "BR": {
            "x": 493,
            "y": 12
         },
         "BL": {
            "x": 415,
            "y": 12
         }
      },
      "type": 1
   },
   {
      "value": "вулиця,",
      "location": {
         "TL": {
            "x": 495,
            "y": 14
         },
         "TR": {
            "x": 550,
            "y": 10
         },
         "BR": {
            "x": 551,
            "y": 22
         },
         "BL": {
            "x": 496,
            "y": 26
         }
      },
      "type": 1
   },
   {
      "value": "140,",
      "location": {
         "TL": {
            "x": 557,
            "y": 8
         },
         "TR": {
            "x": 576,
            "y": 7
         },
         "BR": {
            "x": 577,
            "y": 20
         },
         "BL": {
            "x": 558,
            "y": 21
         }
      },
      "type": 1
   },
   {
      "value": "кв.",
      "location": {
         "TL": {
            "x": 581,
            "y": 6
         },
         "TR": {
            "x": 605,
            "y": 4
         },
         "BR": {
            "x": 606,
            "y": 21
         },
         "BL": {
            "x": 582,
            "y": 23
         }
      },
      "type": 1
   },
   {
      "value": "77",
      "location": {
         "TL": {
            "x": 607,
            "y": 5
         },
         "TR": {
            "x": 628,
            "y": 4
         },
         "BR": {
            "x": 629,
            "y": 19
         },
         "BL": {
            "x": 608,
            "y": 21
         }
      },
      "type": 1
   },
]

因此，我想检查某些字典是否具有相同的位置值；如果是，则向这些字典添加一个新键“string_number”，并赋予相同的索引。以上面的代码为例，前两个字典的 ["location"]["TL"]["y"] 和 ["location"]["TR"]["y"] 都等于 0 像素，并且 ["location"]["BR"]["y"] 和 ["location"]["BL"]["y"] 都等于 12 像素。这意味着这些单词在真实文档中位于同一行，所以我想给它们添加值为 0 的新键“string_number”。结果如下所示：

test_list = [
   {
      "value": "68004,",
      "location": {
         "TL": {
            "x": 351,
            "y": 0
         },
         "TR": {
            "x": 402,
            "y": 0
         },
         "BR": {
            "x": 402,
            "y": 12
         },
         "BL": {
            "x": 351,
            "y": 12
         }
      },
      "type": 1,
      "string_number": 0
   },
   {
      "value": "Чорномор",
      "location": {
         "TL": {
            "x": 415,
            "y": 0
         },
         "TR": {
            "x": 493,
            "y": 0
         },
         "BR": {
            "x": 493,
            "y": 12
         },
         "BL": {
            "x": 415,
            "y": 12
         }
      },
      "type": 1,
      "string_number": 0
   },

然后遍历列表的其余部分，我想找出每一组这样的重复项，并为它们设置相同的行索引。但是，有时像素可能相差 1-2 个点甚至更多（例如 "y" 为 12、10 或 14 时，仍可能表示该单词位于文档的同一行）。那么，是否可以针对这种差异做额外的检查？

编辑：在 Aleksa Svitlica 的帮助下，我创建了一个类，用来完成查找位于同一行的单词的全部工作。它看起来像这样：

class WordParser():
    """Tag word records that sit on the same text line.

    The records are loaded from a JSON file when the parser is created. Each
    record is expected to provide ``record["location"][corner]["y"]`` for the
    corners ``TL``, ``TR``, ``BL`` and ``BR``. Records judged to be on the
    same line receive the same integer under the ``"string_number"`` key.
    """

    # Corners whose y-coordinates must all agree for two words to share a line.
    _CORNERS = ("TL", "TR", "BL", "BR")

    def __init__(self):
        self.list_wbw = self.load_json()
        # Next unused line index; bumped each time a new line group is opened.
        self.next_new_string_number = 0

    def load_json(self):
        """Load the word list from disk, store it on ``self.list_wbw`` and return it.

        The file location comes from ``calc_paths`` (defined outside this
        class -- presumably the path of the word-by-word OCR JSON; confirm
        against that helper).
        """
        # The ``with`` block closes the file on exit, so no explicit close()
        # call is needed afterwards.
        with io.open(calc_paths(status="now", path_type=PathType.OCR_JSON_WBW), 'r', encoding='utf-8') as content:
            self.list_wbw = json.load(content)
        return self.list_wbw

    def mark_images_on_same_line(self):
        """Compare every pair of records and tag the pairs that share a line.

        Pairs are visited in list order (each record against every later
        record). Records that match nothing are left without a
        ``"string_number"`` key.
        """
        for i, image1 in enumerate(self.list_wbw):
            for image2 in self.list_wbw[i + 1:]:
                if self._check_if_images_on_same_line(image1, image2):
                    self._add_string_number_to_images(image1, image2)

    def print_images(self):
        """Print the word list as JSON (3-space indent, non-ASCII kept as-is)."""
        print(json.dumps(self.list_wbw, indent=3, sort_keys=False, ensure_ascii=False))

    def _check_if_images_on_same_line(self, image1, image2):
        """Return True when all four corner y-values of both records match.

        This is a pure predicate: it does not modify either record. A match
        means the y-coordinates of every corner differ by no more than the
        default threshold of ``_pixel_heights_match_within_threshold``.
        """
        return all(
            self._pixel_heights_match_within_threshold(
                image1["location"][corner]["y"],
                image2["location"][corner]["y"],
            )
            for corner in self._CORNERS
        )

    def _add_string_number_to_images(self, image1, image2):
        """Write the same ``"string_number"`` into both records."""
        string_number = self._determine_string_number(image1, image2)
        image1["string_number"] = string_number
        image2["string_number"] = string_number

    def _determine_string_number(self, image1, image2):
        """Pick the line index for a matching pair.

        An index already present on ``image1`` wins, then one on ``image2``;
        only when neither record is tagged yet is a fresh index consumed.
        """
        string_number = self.next_new_string_number

        image1_number = image1.get("string_number")
        image2_number = image2.get("string_number")

        if image1_number is not None:
            string_number = image1_number
        elif image2_number is not None:
            string_number = image2_number
        else:
            # Neither record belongs to a line yet: the fresh index is used,
            # so advance the counter for the next new line.
            self.next_new_string_number += 1

        return string_number

    def _pixel_heights_match_within_threshold(self, height1, height2, threshold=4):
        """Return True when the two heights differ by at most ``threshold``."""
        return abs(height1 - height2) <= threshold

在我的另一个模块中,我调用这些方法:

    # Build the parser (its constructor loads the word list), tag the words
    # that share a line, then print the annotated list as JSON.
    word_parser = WordParser()
    word_parser.mark_images_on_same_line()
    word_parser.print_images()

标签: python list dictionary search duplicates

解决方案


将以下代码添加到你的 test_list 之后，我得到了下面所示的输出。我的代码目前只检查 TL 和 BL 的高度（y 坐标）是否在阈值之内（默认阈值为 2 像素），不过你可以根据自己的需求进行修改：判断规则都在 _check_if_images_on_same_line 中，可以按需更改。

import json

#-------------------------------------------------------------------
#---Classes---------------------------------------------------------
#-------------------------------------------------------------------
class ImageParser():
    """Tag word-image records that sit on the same text line.

    Each record is expected to provide ``record["location"]["TL"]["y"]`` and
    ``record["location"]["BL"]["y"]``. Records judged to be on the same line
    receive the same integer under the ``"string_number"`` key. The records
    are modified in place.
    """

    def __init__(self, list_of_images):
        self.list_of_images = list_of_images
        # Next unused line index; bumped each time a new line group is opened.
        self.next_new_string_number = 0

    # ----------------------------------------------------------------------------
    # ---Public-------------------------------------------------------------------
    # ----------------------------------------------------------------------------

    def mark_images_on_same_line(self):
        """Compare every pair of records and tag the pairs that share a line.

        Pairs are visited in list order (each record against every later
        record). Records that match nothing are left without a
        ``"string_number"`` key.
        """
        for i, image1 in enumerate(self.list_of_images):
            for image2 in self.list_of_images[i + 1:]:
                if self._check_if_images_on_same_line(image1, image2):
                    self._add_string_number_to_images(image1, image2)

    def print_images(self):
        """Print the records as JSON (1-space indent, non-ASCII kept as-is)."""
        # indent=1 is what the previous ``indent=True`` evaluated to, spelled
        # out explicitly.
        print(json.dumps(self.list_of_images, indent=1, sort_keys=False, ensure_ascii=False))

    # ----------------------------------------------------------------------------
    # ---Private------------------------------------------------------------------
    # ----------------------------------------------------------------------------
    def _check_if_images_on_same_line(self, image1, image2):
        """Return True when the TL and BL y-values of both records match.

        This is a pure predicate: it does not modify either record. A match
        means the top (TL) and bottom (BL) y-coordinates each differ by no
        more than the default threshold of
        ``_pixel_heights_match_within_threshold``.
        """
        image1_top = image1["location"]["TL"]["y"]
        image1_bot = image1["location"]["BL"]["y"]

        image2_top = image2["location"]["TL"]["y"]
        image2_bot = image2["location"]["BL"]["y"]

        same_top_position = self._pixel_heights_match_within_threshold(image1_top, image2_top)
        same_bot_position = self._pixel_heights_match_within_threshold(image1_bot, image2_bot)

        return same_bot_position and same_top_position

    def _add_string_number_to_images(self, image1, image2):
        """Write the same ``"string_number"`` into both records."""
        string_number = self._determine_string_number(image1, image2)
        image1["string_number"] = string_number
        image2["string_number"] = string_number

    def _determine_string_number(self, image1, image2):
        """Pick the line index for a matching pair.

        An index already present on ``image1`` wins, then one on ``image2``;
        only when neither record is tagged yet is a fresh index consumed.
        """
        string_number = self.next_new_string_number

        image1_number = image1.get("string_number")
        image2_number = image2.get("string_number")

        if image1_number is not None:
            string_number = image1_number
        elif image2_number is not None:
            string_number = image2_number
        else:
            # Neither record belongs to a line yet: the fresh index is used,
            # so advance the counter for the next new line.
            self.next_new_string_number += 1

        return string_number

    def _pixel_heights_match_within_threshold(self, height1, height2, threshold=2):
        """Return True when the two heights differ by at most ``threshold``."""
        return abs(height1 - height2) <= threshold


#-------------------------------------------------------------------
#---Main------------------------------------------------------------
#-------------------------------------------------------------------
# Demo entry point: tags the sample records in test_list (expected to be
# defined earlier in the same file) and prints the annotated list.
if __name__ == "__main__":
    image_parser = ImageParser(test_list)
    image_parser.mark_images_on_same_line()
    image_parser.print_images()

给出以下结果:

[
 {
  "value": "68004,",
  "location": {
   "TL": {
    "x": 351,
    "y": 0
   },
   "TR": {
    "x": 402,
    "y": 0
   },
   "BR": {
    "x": 402,
    "y": 12
   },
   "BL": {
    "x": 351,
    "y": 12
   }
  },
  "type": 1,
  "string_number": 0
 },
 {
  "value": "Чорномор",
  "location": {
   "TL": {
    "x": 415,
    "y": 0
   },
   "TR": {
    "x": 493,
    "y": 0
   },
   "BR": {
    "x": 493,
    "y": 12
   },
   "BL": {
    "x": 415,
    "y": 12
   }
  },
  "type": 1,
  "string_number": 0
 },
 {
  "value": "вулиця,",
  "location": {
   "TL": {
    "x": 495,
    "y": 14
   },
   "TR": {
    "x": 550,
    "y": 10
   },
   "BR": {
    "x": 551,
    "y": 22
   },
   "BL": {
    "x": 496,
    "y": 26
   }
  },
  "type": 1
 },
 {
  "value": "140,",
  "location": {
   "TL": {
    "x": 557,
    "y": 8
   },
   "TR": {
    "x": 576,
    "y": 7
   },
   "BR": {
    "x": 577,
    "y": 20
   },
   "BL": {
    "x": 558,
    "y": 21
   }
  },
  "type": 1,
  "string_number": 1
 },
 {
  "value": "кв.",
  "location": {
   "TL": {
    "x": 581,
    "y": 6
   },
   "TR": {
    "x": 605,
    "y": 4
   },
   "BR": {
    "x": 606,
    "y": 21
   },
   "BL": {
    "x": 582,
    "y": 23
   }
  },
  "type": 1,
  "string_number": 1
 },
 {
  "value": "77",
  "location": {
   "TL": {
    "x": 607,
    "y": 5
   },
   "TR": {
    "x": 628,
    "y": 4
   },
   "BR": {
    "x": 629,
    "y": 19
   },
   "BL": {
    "x": 608,
    "y": 21
   }
  },
  "type": 1,
  "string_number": 1
 }
]

推荐阅读