在not_analyzed字符串上的elasticsearch术语聚合返回的文档数非常少的存储桶

Question

While playing with AWS Elasticsearch (2.3), I loaded it with some sample data (https://www.elastic.co/guide/en/kibana/3.0/snippets/shakespeare.json) with the following mapping: 在试用AWS Elasticsearch（2.3）时，我向其中加载了一些示例数据（https://www.elastic.co/guide/en/kibana/3.0/snippets/shakespeare.json），其映射如下：

$ curl --url "https://my_es_id.us-east-1.es.amazonaws.com/shakespeare/_mapping"

{
    "shakespeare": {
        "mappings": {
            "act": {
                "properties": {
                    "line_id": {
                        "type": "integer"
                    },
                    "line_number": {
                        "type": "string"
                    },
                    "play_name": {
                        "fields": {
                            "raw": {
                                "index": "not_analyzed",
                                "type": "string"
                            }
                        },
                        "type": "string"
                    },
                    "speaker": {
                        "fields": {
                            "raw": {
                                "index": "not_analyzed",
                                "type": "string"
                            }
                        },
                        "type": "string"
                    },
                    "speech_number": {
                        "type": "integer"
                    },
                    "text_entry": {
                        "type": "string"
                    }
                }
            },
            "line": {
                "properties": {
                    "line_id": {
                        "type": "integer"
                    },
                    "line_number": {
                        "type": "string"
                    },
                    "play_name": {
                        "type": "string"
                    },
                    "speaker": {
                        "type": "string"
                    },
                    "speech_number": {
                        "type": "integer"
                    },
                    "text_entry": {
                        "type": "string"
                    }
                }
            },
            "scene": {
                "properties": {
                    "line_id": {
                        "type": "integer"
                    },
                    "line_number": {
                        "type": "string"
                    },
                    "play_name": {
                        "type": "string"
                    },
                    "speaker": {
                        "type": "string"
                    },
                    "speech_number": {
                        "type": "integer"
                    },
                    "text_entry": {
                        "type": "string"
                    }
                }
            }
        }
    }
}

Now when I run a query to get speaker counts over the entire data set, I get the following result. 现在，当我运行查询以获取整个数据集中各说话者的文档计数时，我得到以下结果。

$ curl -XPOST "https://my_es_id.us-east-1.es.amazonaws.com/shakespeare/_search" -d'
{
    "aggs" : {
        "speakers" : {
            "terms" : { "field" : "speaker.raw"}
        }
    }
}'

{
    "_shards": {
        "failed": 0,
        "successful": 5,
        "total": 5
    },
    "aggregations": {
        "speakers": {
            "buckets": [
                {
                    "doc_count": 4,
                    "key": "BASTARD"
                },
                {
                    "doc_count": 3,
                    "key": "HAMLET"
                },
                {
                    "doc_count": 3,
                    "key": "KING HENRY VIII"
                },
                {
                    "doc_count": 3,
                    "key": "OF SYRACUSE"
                },
                {
                    "doc_count": 3,
                    "key": "PROSPERO"
                },
                {
                    "doc_count": 3,
                    "key": "WARWICK"
                },
                {
                    "doc_count": 2,
                    "key": "ADRIANO DE ARMADO"
                },
                {
                    "doc_count": 2,
                    "key": "ARCHBISHOP OF YORK"
                },
                {
                    "doc_count": 2,
                    "key": "AUFIDIUS"
                },
                {
                    "doc_count": 2,
                    "key": "BENEDICK"
                }
            ],
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 153
        }
    },
    "hits": {
        "hits": [
            {
                "_id": "0",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 1,
                    "line_number": "",
                    "play_name": "Henry IV",
                    "speaker": "",
                    "speech_number": "",
                    "text_entry": "ACT I"
                },
                "_type": "act"
            },
            {
                "_id": "14",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 15,
                    "line_number": "1.1.12",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "Did lately meet in the intestine shock"
                },
                "_type": "line"
            },
            {
                "_id": "19",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 20,
                    "line_number": "1.1.17",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "The edge of war, like an ill-sheathed knife,"
                },
                "_type": "line"
            },
            {
                "_id": "22",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 23,
                    "line_number": "1.1.20",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "Whose soldier now, under whose blessed cross"
                },
                "_type": "line"
            },
            {
                "_id": "24",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 25,
                    "line_number": "1.1.22",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "Forthwith a power of English shall we levy;"
                },
                "_type": "line"
            },
            {
                "_id": "25",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 26,
                    "line_number": "1.1.23",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "Whose arms were moulded in their mothers womb"
                },
                "_type": "line"
            },
            {
                "_id": "26",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 27,
                    "line_number": "1.1.24",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "To chase these pagans in those holy fields"
                },
                "_type": "line"
            },
            {
                "_id": "29",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 30,
                    "line_number": "1.1.27",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "For our advantage on the bitter cross."
                },
                "_type": "line"
            },
            {
                "_id": "40",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 41,
                    "line_number": "1.1.38",
                    "play_name": "Henry IV",
                    "speaker": "WESTMORELAND",
                    "speech_number": 2,
                    "text_entry": "Whose worst was, that the noble Mortimer,"
                },
                "_type": "line"
            },
            {
                "_id": "41",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 42,
                    "line_number": "1.1.39",
                    "play_name": "Henry IV",
                    "speaker": "WESTMORELAND",
                    "speech_number": 2,
                    "text_entry": "Leading the men of Herefordshire to fight"
                },
                "_type": "line"
            }
        ],
        "max_score": 1.0,
        "total": 111396
    },
    "timed_out": false,
    "took": 28
}

The document counts in the aggregation buckets seem abysmally low. 聚合桶中的文档数量似乎非常低。 What I expected to see is the following speakers with doc counts (which I calculated by explicitly evaluating the speaker count over the entire data set): 我期望看到的是以下说话者及其文档计数（这些是我通过对整个数据集逐一统计说话者出现次数计算得出的）：

GLOUCESTER 1920
HAMLET 1582
IAGO 1161
FALSTAFF 1117
KING HENRY V 1086
BRUTUS 1051
OTHELLO 928
MARK ANTONY 927
KING HENRY VI 917
DUKE VINCENTIO 909

I have spent several hours searching online for the reason for this problem, but I can't figure it out. 我已经花了几个小时在网上搜索此问题的原因，但仍然无法弄明白。 What am I doing wrong? 我究竟做错了什么？

Answer 1

The root cause is a bug in the mapping and in the way the data is being searched. 根本原因是映射和数据搜索方式中的错误。 The multifield mapping is set only for doc_type:'act' when it should be set for doc_type:'line'; also, the search shouldn't run over everything but only over doc_type:'line'. 多字段映射只为doc_type：'act'设置，而本应为doc_type：'line'设置；此外，搜索不应覆盖所有内容，而应仅针对doc_type：'line'。

Detailed answer: 详细答案：

Following the example from this page: https://www.elastic.co/guide/en/elasticsearch/guide/current/aggregations-and-analysis.html I realized the bug was in the mapping. 遵循此页面上的示例：https://www.elastic.co/guide/en/elasticsearch/guide/current/aggregations-and-analysis.html ，我意识到该错误出在映射中。

Before: 之前：

I didn't realize the original dataset had multiple doc_types. 我没有意识到原始数据集具有多个doc_types。
In the mapping, only the doc_type:'act' had the field:'speaker' with a not_analyzed version 在映射中，只有doc_type：'act'的字段：'speaker'具有not_analyzed版本
I was searching without setting any doc_type 我搜索时未设置任何doc_type
I was expecting the results would bucket the speakers from doc_type:line when in reality those doc_types didn't have any 'speaker.raw' attribute at all. 我原本期望结果会按doc_type：line中的说话者进行分桶，而实际上这些doc_type根本没有任何“speaker.raw”属性。
Given this, the count in the question is also wrong. 鉴于此，问题中的计数也是错误的。

After: 之后：

The new mapping adds a multifield for field:'speaker' for each of the doc_types: act/scene/line. 新的映射为每个doc_types：act / scene / line的field：“ speaker”添加了一个多字段。 This is speaker.raw and not analyzed. 这是speaker.raw，未经分析。
The new search correctly searches for speakers of the lines, which was the original intent. 新的搜索正确地搜索了台词（line）的说话者，这是原始意图。
The results from Elasticsearch now match the count I'm manually getting from this dataset. Elasticsearch的结果现在与我从该数据集中手动获得的计数匹配。 The current count for top 10 speakers in the doc_type:line is as follows: doc_type：line中排名前10位的说话者的当前计数如下：
GLOUCESTER 1907, HAMLET 1572, IAGO 1153, FALSTAFF 1109, KING HENRY V 1076, BRUTUS 1043, OTHELLO 928, MARK ANTONY 915, KING HENRY VI 909, DUKE VINCENTIO 901 / GLOUCESTER 1907，HAMLET 1572，IAGO 1153，FALSTAFF 1109，KING HENRY V 1076，BRUTUS 1043，OTHELLO 928，MARK ANTONY 915，KING HENRY VI 909，DUKE VINCENTIO 901

Here is the correct mapping: 这是正确的映射：

{
  "shakespeare" : {
    "mappings" : {
      "line" : {
        "properties" : {
          "line_id" : {
            "type" : "integer"
          },
          "line_number" : {
            "type" : "string"
          },
          "play_name" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "speaker" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "speech_number" : {
            "type" : "integer"
          },
          "text_entry" : {
            "type" : "string"
          }
        }
      },
      "act" : {
        "properties" : {
          "line_id" : {
            "type" : "integer"
          },
          "line_number" : {
            "type" : "string"
          },
          "play_name" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "speaker" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "speech_number" : {
            "type" : "integer"
          },
          "text_entry" : {
            "type" : "string"
          }
        }
      },
      "scene" : {
        "properties" : {
          "line_id" : {
            "type" : "integer"
          },
          "line_number" : {
            "type" : "string"
          },
          "play_name" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "speaker" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "speech_number" : {
            "type" : "integer"
          },
          "text_entry" : {
            "type" : "string"
          }
        }
      }
    }
  }
}

With the new mapping in place the results look right: 使用新的映射后，结果看起来正确：

curl -XPOST "https://my_es_id/shakespeare/line/_search" -d'
{
    "aggs" : {
        "speakers" : {
            "terms" : { "field" : "speaker.raw"}
        }
    }
}'
{
    "_shards": {
        "failed": 0,
        "successful": 5,
        "total": 5
    },
    "aggregations": {
        "speakers": {
            "buckets": [
                {
                    "doc_count": 1907,
                    "key": "GLOUCESTER"
                },
                {
                    "doc_count": 1572,
                    "key": "HAMLET"
                },
                {
                    "doc_count": 1153,
                    "key": "IAGO"
                },
                {
                    "doc_count": 1109,
                    "key": "FALSTAFF"
                },
                {
                    "doc_count": 1076,
                    "key": "KING HENRY V"
                },
                {
                    "doc_count": 1043,
                    "key": "BRUTUS"
                },
                {
                    "doc_count": 928,
                    "key": "OTHELLO"
                },
                {
                    "doc_count": 915,
                    "key": "MARK ANTONY"
                },
                {
                    "doc_count": 909,
                    "key": "KING HENRY VI"
                },
                {
                    "doc_count": 901,
                    "key": "DUKE VINCENTIO"
                }
            ],
            "doc_count_error_upper_bound": 461,
            "sum_other_doc_count": 94715
        }
    },
    "hits": {
        "hits": [
            {
                "_id": "14",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 15,
                    "line_number": "1.1.12",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "Did lately meet in the intestine shock"
                },
                "_type": "line"
            },
            {
                "_id": "19",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 20,
                    "line_number": "1.1.17",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "The edge of war, like an ill-sheathed knife,"
                },
                "_type": "line"
            },
            {
                "_id": "22",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 23,
                    "line_number": "1.1.20",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "Whose soldier now, under whose blessed cross"
                },
                "_type": "line"
            },
            {
                "_id": "24",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 25,
                    "line_number": "1.1.22",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "Forthwith a power of English shall we levy;"
                },
                "_type": "line"
            },
            {
                "_id": "25",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 26,
                    "line_number": "1.1.23",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "Whose arms were moulded in their mothers womb"
                },
                "_type": "line"
            },
            {
                "_id": "26",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 27,
                    "line_number": "1.1.24",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "To chase these pagans in those holy fields"
                },
                "_type": "line"
            },
            {
                "_id": "29",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 30,
                    "line_number": "1.1.27",
                    "play_name": "Henry IV",
                    "speaker": "KING HENRY IV",
                    "speech_number": 1,
                    "text_entry": "For our advantage on the bitter cross."
                },
                "_type": "line"
            },
            {
                "_id": "40",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 41,
                    "line_number": "1.1.38",
                    "play_name": "Henry IV",
                    "speaker": "WESTMORELAND",
                    "speech_number": 2,
                    "text_entry": "Whose worst was, that the noble Mortimer,"
                },
                "_type": "line"
            },
            {
                "_id": "41",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 42,
                    "line_number": "1.1.39",
                    "play_name": "Henry IV",
                    "speaker": "WESTMORELAND",
                    "speech_number": 2,
                    "text_entry": "Leading the men of Herefordshire to fight"
                },
                "_type": "line"
            },
            {
                "_id": "44",
                "_index": "shakespeare",
                "_score": 1.0,
                "_source": {
                    "line_id": 45,
                    "line_number": "1.1.42",
                    "play_name": "Henry IV",
                    "speaker": "WESTMORELAND",
                    "speech_number": 2,
                    "text_entry": "A thousand of his people butchered;"
                },
                "_type": "line"
            }
        ],
        "max_score": 1.0,
        "total": 106228
    },
    "timed_out": false,
    "took": 48
}

在not_analyzed字符串上的elasticsearch术语聚合返回的文档数非常少的存储桶

问题描述

1 个解决方案

解决方案1
0 已采纳 2016-09-22 23:35:23

在not_analyzed字符串上的elasticsearch术语聚合返回的文档数非常少的存储桶

问题描述

1 个解决方案

解决方案1 0 已采纳 2016-09-22 23:35:23

解决方案1
0 已采纳 2016-09-22 23:35:23