ES版本
5.5.1
Settings(注意:analysis 属于静态设置,若索引已存在且处于打开状态,需先执行 POST goods_test/_close,更新后再执行 POST goods_test/_open)
PUT goods_test/_settings
{
"index": {
"analysis": {
"filter": {
"native_synonym": {
"type": "synonym",
"ignore_case": true,
"expand": true,
"synonyms": [
"iphone8plus, iphone 8 plus"
]
}
},
"analyzer": {
"ik_smart_synonym_n": {
"filter": [
"native_synonym"
],
"tokenizer": "ik_smart"
}
}
}
}
}
Mappings
PUT goods_test/_mapping/fulltext
{
"fulltext": {
"_all": {
"enabled": false
},
"properties": {
"product_name": {
"type": "text",
"term_vector": "with_positions_offsets",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart_synonym_n"
}
}
}
}
插入数据
PUT goods_test/fulltext/S20
{
"product_name": "iphone8plus手机"
}
PUT goods_test/fulltext/S21
{
"product_name": "iphone 8 plus手机"
}
搜索测试
GET goods_test/fulltext/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"product_name": {
"minimum_should_match": "100%",
"query": "iphone8plus",
"analyzer": "ik_smart_synonym_n"
}
}
}
]
}
},
"from": 0,
"size": 10
}
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.95561695,
"hits": [
{
"_index": "goods_test",
"_type": "fulltext",
"_id": "S20",
"_score": 0.95561695,
"_source": {
"product_name": "iphone8plus手机"
}
},
{
"_index": "goods_test",
"_type": "fulltext",
"_id": "S21",
"_score": 0.8630463,
"_source": {
"product_name": "iphone 8 plus手机"
}
}
]
}
}
GET goods_test/fulltext/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"product_name": {
"minimum_should_match": "100%",
"query": "iphone 8 plus",
"analyzer": "ik_smart_synonym_n"
}
}
}
]
}
},
"from": 0,
"size": 10
}
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.1299736,
"hits": [
{
"_index": "goods_test",
"_type": "fulltext",
"_id": "S20",
"_score": 1.1299736,
"_source": {
"product_name": "iphone8plus手机"
}
}
]
}
}
问题来了:为什么搜索"iphone 8 plus"只能搜到S20而搜不到S21?
分词测试
GET goods_test/_analyze
{
"text": "iphone8plus",
"analyzer": "ik_smart_synonym_n"
}
{
"tokens": [
{
"token": "iphone8plus",
"start_offset": 0,
"end_offset": 11,
"type": "LETTER",
"position": 0
},
{
"token": "iphone",
"start_offset": 0,
"end_offset": 11,
"type": "SYNONYM",
"position": 0
},
{
"token": "8",
"start_offset": 0,
"end_offset": 11,
"type": "SYNONYM",
"position": 1
},
{
"token": "plus",
"start_offset": 0,
"end_offset": 11,
"type": "SYNONYM",
"position": 2
}
]
}
GET goods_test/_analyze
{
"text": "iphone 8 plus",
"analyzer": "ik_smart_synonym_n"
}
{
"tokens": [
{
"token": "iphone",
"start_offset": 0,
"end_offset": 6,
"type": "ENGLISH",
"position": 0
},
{
"token": "iphone8plus",
"start_offset": 0,
"end_offset": 13,
"type": "SYNONYM",
"position": 0,
"positionLength": 3
},
{
"token": "8",
"start_offset": 7,
"end_offset": 8,
"type": "ARABIC",
"position": 1
},
{
"token": "plus",
"start_offset": 9,
"end_offset": 13,
"type": "ENGLISH",
"position": 2
}
]
}
可以看到,两者的区别在于:"iphone 8 plus"的分词结果中,同义词token "iphone8plus" 带有 positionLength=3,即它跨越了"iphone"、"8"、"plus"这三个位置;而"iphone8plus"的分词结果中没有任何token带有positionLength。
explain结果
GET goods_test/fulltext/S21/_explain
{
"query": {
"bool": {
"must": [
{
"match": {
"product_name": {
"minimum_should_match": "100%",
"query": "iphone8plus",
"analyzer": "ik_smart_synonym_n"
}
}
}
]
}
}
}
GET goods_test/fulltext/S21/_explain
{
"query": {
"bool": {
"must": [
{
"match": {
"product_name": {
"minimum_should_match": "100%",
"query": "iphone 8 plus",
"analyzer": "ik_smart_synonym_n"
}
}
}
]
}
}
}
看起来好像是:在 minimum_should_match 为 100% 时,搜索"iphone 8 plus"需要{"iphone", "8", "plus"}和{"iphone8plus"}这两组分词同时命中。这就很奇怪了:按照我的理解,应该是只要上面两组分词中任意一组全部命中,就可以搜到S21才对。
问题总结
对于文中这样的同义词设置,ES 5.5 似乎采用了与旧版本不同的匹配策略(同样的测试在 ES 2.3.0 上进行时,两个文档都可以搜索到)。最近刚升级到 ES 5.5,望解答。
ES版本
5.5.1
Settings(注意:analysis 属于静态设置,若索引已存在且处于打开状态,需先执行 POST goods_test/_close,更新后再执行 POST goods_test/_open)
PUT goods_test/_settings
{
"index": {
"analysis": {
"filter": {
"native_synonym": {
"type": "synonym",
"ignore_case": true,
"expand": true,
"synonyms": [
"iphone8plus, iphone 8 plus"
]
}
},
"analyzer": {
"ik_smart_synonym_n": {
"filter": [
"native_synonym"
],
"tokenizer": "ik_smart"
}
}
}
}
}
Mappings
PUT goods_test/_mapping/fulltext
{
"fulltext": {
"_all": {
"enabled": false
},
"properties": {
"product_name": {
"type": "text",
"term_vector": "with_positions_offsets",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart_synonym_n"
}
}
}
}
插入数据
PUT goods_test/fulltext/S20
{
"product_name": "iphone8plus手机"
}
PUT goods_test/fulltext/S21
{
"product_name": "iphone 8 plus手机"
}
搜索测试
- 搜索"iphone8plus"
GET goods_test/fulltext/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"product_name": {
"minimum_should_match": "100%",
"query": "iphone8plus",
"analyzer": "ik_smart_synonym_n"
}
}
}
]
}
},
"from": 0,
"size": 10
}
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.95561695,
"hits": [
{
"_index": "goods_test",
"_type": "fulltext",
"_id": "S20",
"_score": 0.95561695,
"_source": {
"product_name": "iphone8plus手机"
}
},
{
"_index": "goods_test",
"_type": "fulltext",
"_id": "S21",
"_score": 0.8630463,
"_source": {
"product_name": "iphone 8 plus手机"
}
}
]
}
}
- 搜索"iphone 8 plus"
GET goods_test/fulltext/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"product_name": {
"minimum_should_match": "100%",
"query": "iphone 8 plus",
"analyzer": "ik_smart_synonym_n"
}
}
}
]
}
},
"from": 0,
"size": 10
}
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.1299736,
"hits": [
{
"_index": "goods_test",
"_type": "fulltext",
"_id": "S20",
"_score": 1.1299736,
"_source": {
"product_name": "iphone8plus手机"
}
}
]
}
}
问题来了:为什么搜索"iphone 8 plus"只能搜到S20而搜不到S21?
分词测试
- 测试"iphone8plus"
GET goods_test/_analyze
{
"text": "iphone8plus",
"analyzer": "ik_smart_synonym_n"
}
{
"tokens": [
{
"token": "iphone8plus",
"start_offset": 0,
"end_offset": 11,
"type": "LETTER",
"position": 0
},
{
"token": "iphone",
"start_offset": 0,
"end_offset": 11,
"type": "SYNONYM",
"position": 0
},
{
"token": "8",
"start_offset": 0,
"end_offset": 11,
"type": "SYNONYM",
"position": 1
},
{
"token": "plus",
"start_offset": 0,
"end_offset": 11,
"type": "SYNONYM",
"position": 2
}
]
}
- 测试"iphone 8 plus"
GET goods_test/_analyze
{
"text": "iphone 8 plus",
"analyzer": "ik_smart_synonym_n"
}
{
"tokens": [
{
"token": "iphone",
"start_offset": 0,
"end_offset": 6,
"type": "ENGLISH",
"position": 0
},
{
"token": "iphone8plus",
"start_offset": 0,
"end_offset": 13,
"type": "SYNONYM",
"position": 0,
"positionLength": 3
},
{
"token": "8",
"start_offset": 7,
"end_offset": 8,
"type": "ARABIC",
"position": 1
},
{
"token": "plus",
"start_offset": 9,
"end_offset": 13,
"type": "ENGLISH",
"position": 2
}
]
}
可以看到,两者的区别在于:"iphone 8 plus"的分词结果中,同义词token "iphone8plus" 带有 positionLength=3,即它跨越了"iphone"、"8"、"plus"这三个位置;而"iphone8plus"的分词结果中没有任何token带有positionLength。
explain结果
- explain S21和"iphone8plus"
GET goods_test/fulltext/S21/_explain
{
"query": {
"bool": {
"must": [
{
"match": {
"product_name": {
"minimum_should_match": "100%",
"query": "iphone8plus",
"analyzer": "ik_smart_synonym_n"
}
}
}
]
}
}
}
- explain S21和"iphone 8 plus"
GET goods_test/fulltext/S21/_explain
{
"query": {
"bool": {
"must": [
{
"match": {
"product_name": {
"minimum_should_match": "100%",
"query": "iphone 8 plus",
"analyzer": "ik_smart_synonym_n"
}
}
}
]
}
}
}
看起来好像是:在 minimum_should_match 为 100% 时,搜索"iphone 8 plus"需要{"iphone", "8", "plus"}和{"iphone8plus"}这两组分词同时命中。这就很奇怪了:按照我的理解,应该是只要上面两组分词中任意一组全部命中,就可以搜到S21才对。
问题总结
对于文中这样的同义词设置,ES 5.5 似乎采用了与旧版本不同的匹配策略(同样的测试在 ES 2.3.0 上进行时,两个文档都可以搜索到)。最近刚升级到 ES 5.5,望解答。
1 个回复
jasmhusc
赞同来自: