首页 > 解决方案 > 统计词项出现次数,或获取 doc_count 对应的文档

问题描述

有没有办法在复合(composite)聚合之后获取对应的文档?假设聚合后得到 doc_count = 5,我想知道这个 doc_count 统计的是哪 5 个文档,因为我需要对这些文档做进一步分析。或者,有没有办法像下面的例子那样统计某个词项出现的次数?

我想统计每个用户(ant、bird、cat、elep)购买产品 a 的次数。我的数据如下:

{"Date":"20200515","product":["a","a","a","b","c"],"user":"ant","rank":"silver"}
{"Date":"20200515","product":["a","b","c","e","f"],"user":"ant","rank":"silver"}
{"Date":"20200515","product":["a","a","c","c","d"],"user":"bird","rank":"silver"}
{"Date":"20200515","product":["a","a","c","d","e"],"user":"cat","rank":"silver"}
{"Date":"20200515","product":["a","a","a","b","f"],"user":"cat","rank":"silver"}
{"Date":"20200515","product":["a","a","b","c","d"],"user":"elep","rank":"silver"}

我的查询看起来像这样

{
  "aggs": {
    "comp": {
      "composite": {
        "sources": [
          {
            "log_date": {
              "terms": {
                "field": "Date.keyword"
              }
            }
          },
          {
            "product": {
              "terms": {
                "field": "product.keyword",
                "missing_bucket": true
              }
            }
          },
          {
            "rank": {
              "terms": {
                "field": "rank.keyword",
                "missing_bucket": true
              }
            }
          },
          {
            "user": {
              "terms": {
                "field": "user.keyword",
                "missing_bucket": true
              }
            }
          }
        ]
      }
    }
  }
}

这是我的结果

Date      user rank    product doc_count
20200515  ant  silver    a        2
20200515  bird silver    a        1  
20200515  cat  silver    a        2
20200515  elep silver    a        1
...

这是我的预期结果(amount 表示该用户的所有文档中产品 a 出现的总次数,例如 ant 的两条文档中分别出现 3 次和 1 次,合计 4 次)

Date      user rank    product doc_count amount
20200515  ant  silver    a        2        4
20200515  bird silver    a        1        2
20200515  cat  silver    a        2        5
20200515  elep silver    a        1        2

标签: elasticsearch, elasticsearch-dsl

解决方案


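您需要将 product 字段改为嵌套(nested)类型,并把每个产品存成一个带 name 属性的对象。对普通数组字段做 terms 聚合时,同一文档中重复的值只会让该文档被计数一次;改为嵌套类型后,数组中的每个元素都会作为独立的嵌套文档被索引,因此可以逐个计数。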

映射:

{
  "mappings": {
    "properties": {
      "product":{
        "type": "nested",
        "properties": {
          "name":{
            "type":"text",
            "fields":{
              "keyword":{
                "type":"keyword"
              }
            }
          }
        }
      }
    }
  }
}

数据:

{
  "Date": "20200515",
  "product": [
    {
      "name": "a"
    },
    {
      "name": "a"
    },
    {
      "name": "a"
    },
    {
      "name": "a"
    },
    {
      "name": "b"
    },
    {
      "name": "c"
    }
  ],
  "user": "ant",
  "rank": "silver"
}

查询:

// Annotated search request. The // comments are accepted by Elasticsearch's
// request parser and by the Kibana Console; strip them for strict-JSON tools.
{
  "query": {
    "bool": {
      "filter": {
        "nested": { // nested query: keep only documents that contain product "a"
          "path": "product",
          "query": {
            "match": {
              "product.name": "a"
            }
          }
        }
      }
    }
  },
  "aggs": {
    "user_count": { // total number of distinct users among the matching documents
      "cardinality": {
        "field": "user.keyword"
      }
    },
    "users": {
      "terms": {
        "field": "user.keyword",
        "size": 10
      },
      "aggs": {
        "product": {
          "nested": {
            "path": "product"
          },
          "aggs": {
            "product_name": {
              "terms": {
                "field": "product.name.keyword",
                "include": "a", // include only this value; an array of values is also accepted
                "size": 10
              },
              "aggs": {
                "amount": {
                  "value_count": {
                    "field": "product.name.keyword"
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

结果

"hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.0,
    "hits" : [
      {
        "_index" : "index44",
        "_type" : "_doc",
        "_id" : "WtSYJXIBEIlbGJUZf3Ve",
        "_score" : 0.0,
        "_source" : {
          "Date" : "20200515",
          "product" : [
            {
              "name" : "a"
            },
            {
              "name" : "a"
            },
            {
              "name" : "a"
            },
            {
              "name" : "a"
            },
            {
              "name" : "b"
            },
            {
              "name" : "c"
            }
          ],
          "user" : "ant",
          "rank" : "silver"
        }
      }
    ]
  },
  "aggregations" : {
    "user_count" : {
      "value" : 1
    },
    "users" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "ant",
          "doc_count" : 1,
          "product" : {
            "doc_count" : 6,
            "product_name" : {
              "doc_count_error_upper_bound" : 0,
              "sum_other_doc_count" : 0,
              "buckets" : [
                {
                  "key" : "a",
                  "doc_count" : 4,
                  "amount" : {
                    "value" : 4
                  }
                }
              ]
            }
          }
        }
      ]
    }
  }

推荐阅读