要不要再翻翻文档呢?

Elasticsearch使用类似having查询满足条件的数据

Elasticsearch | 作者 F91 | 发布于2019年07月16日 | 阅读数:7476

背景:
某公司在网上做一项调查,对公司员工的基本信息进行收集。目前得到一批数据进行分析。
现在有以下问题需要进行分析
数据:
100w家,每个公司有N个部门,每个部门有N个员工
 

要求:
查询满足【同一家公司同一部门里,年龄在25-30岁之间的男性员工超过10人】的公司有哪些?
数据结构该如何设计?如图想了两种数据结构,不知道那种合适。
Elasticsearch能否像SQL的having count>N的这种方式作为query的条件筛选么?
ES版本5.4.3


备注:以上场景根据实际业务进行改编,数据量要比以上说明要多很多


 
 
312321313.png

 
已邀请:

medcl - 今晚打老虎。

赞同来自: laoyang360

DELETE index

PUT /index
{
  "mappings": {
    "properties": {
      "company":{
        "type": "keyword"
      },
      "employee": {
        "type": "nested",
        "properties": {
          "department": {
            "type": "keyword"
          },
          "name": {
            "type": "text"
          },
          "age": {
            "type": "integer"
          }
        }
      }
    }
  }
}

PUT index/_doc/1
{
  "company": "A",
  "employee": [
    {
      "department": "研发",
      "name": "张三",
      "age": 28
    },
     {
      "department": "研发",
      "name": "刘麻子",
      "age": 30
    },
    {
      "department": "研发",
      "name": "李四",
      "age": 20
    }
  ]
}

PUT index/_doc/2
{
  "company": "B",  
  "employee": [
    {
      "department": "研发",
      "name": "王五",
      "age": 24
    },
    {
      "department": "研发",
      "name": "刘六",
      "age": 20
    }
  ]
}

PUT index/_doc/3
{
  "company": "C",
  "employee": [
    {
      "department": "测试",
      "name": "翠花",
      "age": 25
    },
    {
      "department": "测试",
      "name": "凤儿",
      "age": 30
    },
    {
      "department": "测试",
      "name": "小兰",
      "age": 28
    },
    {
      "department": "研发",
      "name": "小陈",
      "age": 25
    },
    {
      "department": "研发",
      "name": "小宋",
      "age": 30
    },
    {
      "department": "研发",
      "name": "小吴",
      "age": 28
    }
  ]
}

GET /index/_search
{
  "size": 0,
  "aggs": {
    "company": {
      "terms": {
        "field": "company",
        "size": 10
      },
      "aggs": {
        "employee": {
          "nested": {
            "path": "employee"
          },
          "aggs": {
            "departments": {
              "terms": {
                "field": "employee.department"
              },
              "aggs": {
                "employee_count": {
                  "range": {
                    "field": "employee.age",
                    "ranges": [
                      {
                        "from": 25,
                        "to": 31
                      }
                    ]
                  },
                  "aggs": {
                    "my_filter": {
                      "bucket_selector": {
                        "buckets_path": {
                          "the_doc_count": "_count"
                        },
                        "script": "params.the_doc_count > 2"
                      }
                    }
                  }
                },
                "my_filter": {
                  "bucket_selector": {
                    "buckets_path": {
                      "the_doc_count": "employee_count._bucket_count"
                    },
                    "script": "params.the_doc_count > 0"
                  }
                }
              }
            }
          }
        },
        "min_bucket_selector": {
          "bucket_selector": {
            "buckets_path": {
              "count": "employee>departments._bucket_count"
            },
            "script": {
              "source": "params.count != 0"
            }
          }
        }
      }
    }
  }
}



 结果如下:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"company" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "C",
"doc_count" : 1,
"employee" : {
"doc_count" : 6,
"departments" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "测试",
"doc_count" : 3,
"employee_count" : {
"buckets" : [
{
"key" : "25.0-31.0",
"from" : 25.0,
"to" : 31.0,
"doc_count" : 3
}
]
}
},
{
"key" : "研发",
"doc_count" : 3,
"employee_count" : {
"buckets" : [
{
"key" : "25.0-31.0",
"from" : 25.0,
"to" : 31.0,
"doc_count" : 3
}
]
}
}
]
}
}
}
]
}
}
}
只有 C 公司满足需求。

params.the_doc_count > 2 这里的 2 换成实际的最小的员工数,比如题中的 10。
 

zqc0512 - andy zhou

赞同来自:

这个搞ES SQL吧。

talon - 80后IT男

赞同来自:

可以看下 having count : https://stackoverflow.com/ques ... 53947

F91 - 90后IT男,.Net

赞同来自:

使用Elasticsearch aggs 和 bucket_selector 能满足 group 和 having的效果。即便是Nested文档结构的也能查出来。但是怎么让他在Nested文档结构中作为一个子查询的条件进行筛选父文档呢?

laoyang360 - 《一本书讲透Elasticsearch》作者,Elastic认证工程师 [死磕Elasitcsearch]知识星球地址:http://t.cn/RmwM3N9;微信公众号:铭毅天下; 博客:https://elastic.blog.csdn.net

赞同来自:

推荐使用bucket_selector实现,

sun_tie

赞同来自:

我测试到一个方案,思路是先选出同一公司同一部门,符合年龄在25-30岁之间的男性员工的最大数,对最大数使用bucket_selector,判断,最大值超过10,则选出该公司,供参考,
PUT company_employee_mapping
{
"mappings": {
"properties": {
"company_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"department": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"No": {
"type": "long"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"sex": {
"type": "keyword"
},
"age": {
"type": "integer"
}
}
}
}


POST /_bulk
{ "index" : { "_index" : "company_employee_mapping", "_id" : "1" } }
{ "no" : 1, "name":"zhangsan", "age":25, "company_name":"xiaomi", "department":"HR","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "2" } }
{ "no" : 2, "name":"lisi", "age":30, "company_name":"xiaomi", "department":"HR","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "3" } }
{ "no" : 3, "name":"wangwu", "age":27, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "4" } }
{ "no" : 4, "name":"zhaoliu", "age":29, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "5" } }
{ "no" : 5, "name":"fengqi", "age":35, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "6" } }
{ "no" : 6, "name":"laoliu", "age":28, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "7" } }
{ "no" : 2, "name":"lisi", "age":30, "company_name":"xiaomi", "department":"HR","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "8" } }
{ "no" : 3, "name":"wangwu", "age":27, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "9" } }
{ "no" : 4, "name":"zhaoliu", "age":29, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "10" } }
{ "no" : 5, "name":"fengqi", "age":35, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "16" } }
{ "no" : 6, "name":"laoliu", "age":28, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "12" } }
{ "no" : 2, "name":"lisi", "age":30, "company_name":"xiaomi", "department":"HR","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "13" } }
{ "no" : 3, "name":"wangwu", "age":27, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "14" } }
{ "no" : 4, "name":"zhaoliu", "age":29, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "15" } }
{ "no" : 5, "name":"fengqi", "age":35, "company_name":"xiaomi", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "118" } }
{ "no" : 6, "name":"laoliu", "age":28, "company_name":"huawei", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "112" } }
{ "no" : 2, "name":"lisi", "age":30, "company_name":"huawei", "department":"HR","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "113" } }
{ "no" : 3, "name":"wangwu", "age":27, "company_name":"huawei", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "114" } }
{ "no" : 4, "name":"zhaoliu", "age":29, "company_name":"huawei", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "115" } }
{ "no" : 5, "name":"fengqi", "age":35, "company_name":"huawei", "department":"Marketing","sex":"male"}
{ "index" : { "_index" : "company_employee_mapping", "_id" : "117" } }
{ "no" : 6, "name":"laoliu", "age":28, "company_name":"huawei", "department":"Marketing","sex":"male"}


GET company_employee_mapping/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"age": {
"gte": 25,
"lte": 31
}
}
},
{
"match": {
"sex": "male"
}
}
]
}
},
"aggs": {
"company": {
"terms": {
"field": "company_name.keyword"
},
"aggs": {
"department": {
"terms": {
"field": "department.keyword"
}
},
"max_employee_count": {
"max_bucket": {
"buckets_path": "department._count"
}
},
"my_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"employee_count": "max_employee_count"
},
"script": "params.employee_count > 5"
}
}
}
}
}
}



返回结果如下:
{
"took" : 7,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 17,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"company" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "xiaomi",
"doc_count" : 12,
"department" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Marketing",
"doc_count" : 8
},
{
"key" : "HR",
"doc_count" : 4
}
]
},
"max_employee_count" : {
"value" : 8.0,
"keys" : [
"Marketing"
]
}
}
]
}
}
}

要回复问题请先登录注册