首页 > 解决方案 > 使用匹配短语查询后的 Elasticsearch 唯一文档

问题描述

大家好,我有一个如下所示的 Elasticsearch 文档。我只对其中的“tags”键感兴趣。

 "_index": "graph_20211025t0909",
                "_type": "_doc",
                "_id": "E12201A5-CC50-40AF-97AE-C54A2CA303F7",
                "_score": null,
                "_source": {
                    "entity_id": "E12201A5-CC50-40AF-97AE-C54A2CA303F7",
                    "properties": {
                        "external": {
                            "facebook": {
                                "id": "muji.jp"
                            },
                            "instagram": {
                                "id": "muji_global"
                            },
                            "twitter": {
                                "id": "muji_net"
                            },
                            "wikidata": {
                                "id": "Q708789"
                            }
                        },
                        "akas": [
                            {
                                "value": "Muji",
                                "language": "zh"
                            },
                            {
                                "value": "multinacional japonesa",
                                "language": "es"
                            },
                        ]
                    },
                    "data_source": {
                        "data_pull_date": "202109",
                        "source_id": "muji_global",
                        "dataset": "brand"
                    },
                    "scoring_entity_data_size": 5306,
                    "population_percentile": 0.9855572298745676,
                    "type_synonyms": [],
                    "@version": "1",
                    "@timestamp": "2021-10-25T16:28:24.892Z",
                    "name": "Muji",
                    "types": [
                        "urn:entity:brand"
                    ],
                    "tags": [
                        {
                            "tag_id": "D24DE9CF-C778-4468-8433-5A0E8AA2BA9D",
                            "name": "Wikipedia articles with GND identifiers",
                            "type": "urn:tag:wikipedia_category"
                        },
                        {
                            "tag_id": "67A608CC-2DA3-4C78-B7F6-6DD419744FFC",
                            "name": "Clothing brands of Japan",
                            "type": "urn:tag:wikipedia_category"
                        },
]
}

我的 Elasticsearch 查询如下:

{
    "size": 20,
    "_source": ["tags"],
    "sort": [
        { "@timestamp": { "order": "desc" } }
    ],
    "query": {
        "nested" : {
            "path" : "tags",
                "query" : {
                    "bool" : {
                        "must" : [
                          { "match_phrase" : {"tags.name" : "thriller"} }
                        ]    
                }
            }
        }
    }
}

我的问题是:如何让这个 Elasticsearch 查询返回不重复的结果?我在“tags”字段中按“tags.name”进行搜索,希望返回的“tags”是一组不重复的条目。例如,目前返回的结果是:

tags: [
{
                        {
                            "name": "Male actors",
                            "tag_id": "A2A18D57-24B5-4578-B0D3-2A9190EEAD7C",
                            "type": "urn:tag:wikipedia_category"
                        },
                        {
                            "name": "some tag name",
                            "tag_id": "0CB4BE42-026F-4B14-A59A-C5A331E8A56F",
                            "type": "urn:tag:wikipedia_category"
                        },
    },
                        {
                            "name": "Male actors",
                            "tag_id": "A2A18D57-24B5-4578-B0D3-2A9190EEAD7C",
                            "type": "urn:tag:wikipedia_category"
                        },
                        {
                            "name": "another tag name",
                            "tag_id": "0CB4BE42-026F-4B14-A59A-C5A331E8A56F",
                            "type": "urn:tag:wikipedia_category"
                        },
}

]

我希望结果中不要重复出现 "name": "Male actors" 这一项。

标签: elasticsearch

解决方案


您的查询返回的 tags 来自不同的文档,因此不能假设它们是唯一的。我建议使用聚合来获取不重复的 tags.name:

{
    "size": 20,
    "_source": ["tags"],
    "sort": [
        { "@timestamp": { "order": "desc" } }
    ],
    "query": {
        "nested" : {
            "path" : "tags",
                "query" : {
                    "bool" : {
                        "must" : [
                          { "match_phrase" : {"tags.name" : "thriller"} }
                        ]    
                }
            }
        }
    },
   "aggs": {
     "unique_tags": {
       "nested": {
         "path": "tags"
       },
       "aggs": {
         "tag_name": {
           "terms": {
              "field": "tags.name"
           }
         }
       }
     }
   }
}

推荐阅读