提问:布和纸怕什么?

分片性能与检索结果数量精确性

Elasticsearch | 作者 lll479964143 | 发布于2021年11月12日 | 阅读数:1065

我有3000万的数据,业务场景为全文检索,es服务购买的是阿里的es服务,共3个数据节点,这3000万数据占用磁盘空间120G左右,当前设置分片数量为9(实际数据占用空间低于预期,计划修改为3)。
现在出现了一个问题:在进行检索时,我要统计一下符合检索条件的数量,相同的检索条件每次返回的统计数量不一致(检索结果300万左右,数量波动在十万级左右)。网上说这种每次结果不一致是因为分片数量的问题,请问这个问题是因为分片数量不为1吗?如果是,分片的性能优化,与检索结果的一致性问题该怎么解决?
mapping如下:
PUT /medpeer_library_article_optimization_shard
{
"settings": {
"number_of_shards": 9,
"index.mapping.nested_objects.limit" : 300000
},
"mappings": {
"dynamic":"false",
"properties": {
"article_id": {
"type": "keyword"
},
"article_pmid":{
"type": "keyword"
},
"article_doi":{
"type": "keyword"
},
"article_name":{
"type":"text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
},
"journal_info":{
"type": "object",
"properties": {
"journal_code": {
"type": "keyword"
},
"journal_name": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"fields": {
"journal_title":{
"type" : "keyword"
}
}
}
}
},
"article_year":{
"type": "integer"
},
"published_time": {
"type":"date"
},
"published_time_precision":{
"type":"keyword"
},
"cite":{
"type":"keyword"
},
"is_del":{
"type": "integer"
},
"create_time" : {
"type":"date"
},
"last_time" : {
"type":"date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
},
"impact_factor":{
"type":"float"
},
"referenced_article":{
"type":"integer"
},
"abstract_section":{
"type": "nested",
"properties": {
"section_attr": {
"type": "keyword"
},
"eng_text": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
},
"chs_text": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
}
}
},
"issn_info":{
"type": "object",
"properties": {
"issn_code": {
"type": "keyword"
},
"eissn_code": {
"type": "keyword"
}
}
},
"author_info":{
"type": "nested",
"properties": {
"author_id": {
"type": "keyword"
},
"author_name": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"fields": {
"author_all_name":{
"type" : "keyword"
}
}
},
"first_author":{
"type":"integer"
}

}
},
"unit_info":{
"type": "nested",
"properties": {
"unit_id": {
"type": "keyword"
},
"unit_name": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"fields": {
"unit_all_name":{
"type" : "keyword"
}
}
}
}
},
"type_info":{
"type": "nested",
"properties": {
"type_id": {
"type": "keyword"
},
"type_name": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
}
}
},
"article_language":{
"type": "nested",
"properties": {
"marc_code": {
"type": "keyword"
},
"name_eng": {
"type": "keyword"
},
"name_chs": {
"type": "keyword"
}
}
},
"keyword_info":{
"type": "nested",
"properties": {
"keyword_id": {
"type": "keyword"
},
"eng_keyword": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"fields": {
"eng_word":{
"type" : "keyword"
}
}
},
"chs_keyword": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"fields": {
"chs_word":{
"type" : "keyword"
}
}
}
}
},
"descriptor_mesh_info":{
"type": "nested",
"properties": {
"descriptor_id": {
"type": "keyword"
},
"mesh_descriptor": {
"type": "keyword"
},
"mesh_descriptor_chs":{
"type":"keyword"
}
}
},
"qualifier_mesh_info":{
"type": "nested",
"properties": {
"qualifier_id": {
"type": "keyword"
},
"mesh_qualifier": {
"type": "keyword"
},
"mesh_qualifier_chs":{
"type":"keyword"
}
}
}
}
}

}
检索语句
POST medpeer_library_article_optimization_shard/_count
{
"query": {
"bool": {
"should": [
{
"match": {
"article_name": {
"query": "cell",
"boost" : 10
}
}
},
{
"match_phrase_prefix": {
"article_name": {
"query": "cell",
"boost" : 4,
"max_expansions" : 50
}
}
},
{
"nested": {
"path": "abstract_section",
"query": {
"match_phrase_prefix" : {
"abstract_section.eng_text" : {
"query" : "cell",
"boost" : 2,
"max_expansions" : 50
}
}
}
}
},
{
"nested": {
"path": "abstract_section",
"query": {
"match_phrase_prefix" : {
"abstract_section.chs_text" : {
"query" : "cell",
"boost" : 2,
"max_expansions" : 50
}
}
}
}
}
],
"filter": [
{
"match": {
"is_del": 0
}
}
],
"minimum_should_match": 1
}
}
}
已邀请:

caster_QL

赞同来自:

分片可能存在由于主副分片不一致导致的查询结果不一致;
多分片会影响全文算分以及聚合等结果的准确性。
你这个查询结果不一致,感觉是主副分片不一致导致的,可以先对索引执行一次 force merge 再查询试试。

zcc_vv - 95

赞同来自:

别用 match_phrase_prefix 试下,我感觉是主副节点 expansions 出来的词不一样导致的。

MarvinLiu

赞同来自:

查询语句加一个 preference 参数:
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html#search-preference

要回复问题请先登录注册