求助:搜索时受逆文档频率(IDF)影响,匹配到更多分词的文档没有排在最前面
匿名 | 发布于2021年08月06日 | 阅读数:1551
这是我的查询语句,用到了 nested 查询和 cross_fields 类型的 multi_match:
GET archive/_explain/1
{
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"nested": {
"query": {
"multi_match": {
"query": "集体土地使用权登记审批表",
"fields": [
"documents.remark^1.0",
"documents.title^1.0"
],
"type": "cross_fields"
}
},
"path": "documents",
"score_mode": "max"
}
},
{
"multi_match": {
"query": "集体土地使用权登记审批表",
"fields": [
"column1^1.0",
"column3^1.0",
"column4^1.0",
"column5^1.0",
"column6^1.0",
"column7^1.0",
"column8^1.0",
"column9^1.0",
"remark^1.0",
"title^1.0"
],
"type": "cross_fields"
}
}
]
}
}
]
}
}
}
这是我希望排在最前面的文档的 explain 结果:
{
"_index" : "archive_20210805_ansj",
"_type" : "_doc",
"_id" : "1",
"matched" : true,
"explanation" : {
"value" : 6.763243,
"description" : "sum of:",
"details" : [
{
"value" : 6.763243,
"description" : "sum of:",
"details" : [
{
"value" : 6.763243,
"description" : "Score based on 4 child docs in range from 97628 to 97638, best match:",
"details" : [
{
"value" : 6.763243,
"description" : "sum of:",
"details" : [
{
"value" : 6.763243,
"description" : "sum of:",
"details" : [
{
"value" : 2.568368,
"description" : "max of:",
"details" : [
{
"value" : 2.568368,
"description" : "weight(documents.title:集体土地 in 97628) [PerFieldSimilarity], result of:",
"details" : [
{
"value" : 2.568368,
"description" : "score(freq=1.0), computed as boost * idf * tf from:",
"details" : [
{
"value" : 2.2,
"description" : "boost",
"details" : [ ]
},
{
"value" : 2.684873,
"description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details" : [
{
"value" : 89785,
"description" : "n, number of documents containing term",
"details" : [ ]
},
{
"value" : 1315926,
"description" : "N, total number of documents with field",
"details" : [ ]
}
]
},
{
"value" : 0.4348213,
"description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details" : [
{
"value" : 1.0,
"description" : "freq, occurrences of term within document",
"details" : [ ]
},
{
"value" : 1.2,
"description" : "k1, term saturation parameter",
"details" : [ ]
},
{
"value" : 0.75,
"description" : "b, length normalization parameter",
"details" : [ ]
},
{
"value" : 21.0,
"description" : "dl, length of field",
"details" : [ ]
},
{
"value" : 18.903868,
"description" : "avgdl, average length of field",
"details" : [ ]
}
]
}
]
}
]
}
]
},
{
"value" : 1.6056693,
"description" : "max of:",
"details" : [
{
"value" : 1.6056693,
"description" : "weight(documents.title:使用权 in 97628) [PerFieldSimilarity], result of:",
"details" : [
{
"value" : 1.6056693,
"description" : "score(freq=1.0), computed as boost * idf * tf from:",
"details" : [
{
"value" : 2.2,
"description" : "boost",
"details" : [ ]
},
{
"value" : 1.678505,
"description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details" : [
{
"value" : 245621,
"description" : "n, number of documents containing term",
"details" : [ ]
},
{
"value" : 1315926,
"description" : "N, total number of documents with field",
"details" : [ ]
}
]
},
{
"value" : 0.4348213,
"description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details" : [
{
"value" : 1.0,
"description" : "freq, occurrences of term within document",
"details" : [ ]
},
{
"value" : 1.2,
"description" : "k1, term saturation parameter",
"details" : [ ]
},
{
"value" : 0.75,
"description" : "b, length normalization parameter",
"details" : [ ]
},
{
"value" : 21.0,
"description" : "dl, length of field",
"details" : [ ]
},
{
"value" : 18.903868,
"description" : "avgdl, average length of field",
"details" : [ ]
}
]
}
]
}
]
}
]
},
{
"value" : 0.6689305,
"description" : "max of:",
"details" : [
{
"value" : 0.6689305,
"description" : "weight(documents.title:登记 in 97628) [PerFieldSimilarity], result of:",
"details" : [
{
"value" : 0.6689305,
"description" : "score(freq=1.0), computed as boost * idf * tf from:",
"details" : [
{
"value" : 2.2,
"description" : "boost",
"details" : [ ]
},
{
"value" : 0.6992742,
"description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details" : [
{
"value" : 653944,
"description" : "n, number of documents containing term",
"details" : [ ]
},
{
"value" : 1315926,
"description" : "N, total number of documents with field",
"details" : [ ]
}
]
},
{
"value" : 0.4348213,
"description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details" : [
{
"value" : 1.0,
"description" : "freq, occurrences of term within document",
"details" : [ ]
},
{
"value" : 1.2,
"description" : "k1, term saturation parameter",
"details" : [ ]
},
{
"value" : 0.75,
"description" : "b, length normalization parameter",
"details" : [ ]
},
{
"value" : 21.0,
"description" : "dl, length of field",
"details" : [ ]
},
{
"value" : 18.903868,
"description" : "avgdl, average length of field",
"details" : [ ]
}
]
}
]
}
]
}
]
},
{
"value" : 1.9202754,
"description" : "max of:",
"details" : [
{
"value" : 1.9202754,
"description" : "weight(documents.title:审批表 in 97628) [PerFieldSimilarity], result of:",
"details" : [
{
"value" : 1.9202754,
"description" : "score(freq=1.0), computed as boost * idf * tf from:",
"details" : [
{
"value" : 2.2,
"description" : "boost",
"details" : [ ]
},
{
"value" : 2.0073822,
"description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details" : [
{
"value" : 176781,
"description" : "n, number of documents containing term",
"details" : [ ]
},
{
"value" : 1315926,
"description" : "N, total number of documents with field",
"details" : [ ]
}
]
},
{
"value" : 0.4348213,
"description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details" : [
{
"value" : 1.0,
"description" : "freq, occurrences of term within document",
"details" : [ ]
},
{
"value" : 1.2,
"description" : "k1, term saturation parameter",
"details" : [ ]
},
{
"value" : 0.75,
"description" : "b, length normalization parameter",
"details" : [ ]
},
{
"value" : 21.0,
"description" : "dl, length of field",
"details" : [ ]
},
{
"value" : 18.903868,
"description" : "avgdl, average length of field",
"details" : [ ]
}
]
}
]
}
]
}
]
}
]
},
{
"value" : 0.0,
"description" : "match on required clause, product of:",
"details" : [
{
"value" : 0.0,
"description" : "# clause",
"details" : [ ]
},
{
"value" : 1.0,
"description" : "_type:__documents",
"details" : [ ]
}
]
}
]
}
]
}
]
},
{
"value" : 0.0,
"description" : "match on required clause, product of:",
"details" : [
{
"value" : 0.0,
"description" : "# clause",
"details" : [ ]
},
{
"value" : 1.0,
"description" : "DocValuesFieldExistsQuery [field=_primary_term]",
"details" : [ ]
}
]
}
]
}
}
这是实际搜索后得分最高的文档的 explain 结果:
{
"_index" : "archive_20210805_ansj",
"_type" : "_doc",
"_id" : "34589",
"matched" : true,
"explanation" : {
"value" : 10.107214,
"description" : "sum of:",
"details" : [
{
"value" : 10.107214,
"description" : "sum of:",
"details" : [
{
"value" : 1.8079908,
"description" : "Score based on 21 child docs in range from 871847 to 871867, best match:",
"details" : [
{
"value" : 1.8079908,
"description" : "sum of:",
"details" : [
{
"value" : 1.8079908,
"description" : "sum of:",
"details" : [
{
"value" : 1.8079908,
"description" : "max of:",
"details" : [
{
"value" : 1.8079908,
"description" : "weight(documents.title:审批表 in 131437) [PerFieldSimilarity], result of:",
"details" : [
{
"value" : 1.8079908,
"description" : "score(freq=1.0), computed as boost * idf * tf from:",
"details" : [
{
"value" : 2.2,
"description" : "boost",
"details" : [ ]
},
{
"value" : 2.0073822,
"description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details" : [
{
"value" : 176781,
"description" : "n, number of documents containing term",
"details" : [ ]
},
{
"value" : 1315926,
"description" : "N, total number of documents with field",
"details" : [ ]
}
]
},
{
"value" : 0.40939593,
"description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details" : [
{
"value" : 1.0,
"description" : "freq, occurrences of term within document",
"details" : [ ]
},
{
"value" : 1.2,
"description" : "k1, term saturation parameter",
"details" : [ ]
},
{
"value" : 0.75,
"description" : "b, length normalization parameter",
"details" : [ ]
},
{
"value" : 24.0,
"description" : "dl, length of field",
"details" : [ ]
},
{
"value" : 18.903868,
"description" : "avgdl, average length of field",
"details" : [ ]
}
]
}
]
}
]
}
]
}
]
},
{
"value" : 0.0,
"description" : "match on required clause, product of:",
"details" : [
{
"value" : 0.0,
"description" : "# clause",
"details" : [ ]
},
{
"value" : 1.0,
"description" : "_type:__documents",
"details" : [ ]
}
]
}
]
}
]
},
{
"value" : 8.299223,
"description" : "max of:",
"details" : [
{
"value" : 8.299223,
"description" : "weight(title:审批表 in 131438) [PerFieldSimilarity], result of:",
"details" : [
{
"value" : 8.299223,
"description" : "score(freq=1.0), computed as boost * idf * tf from:",
"details" : [
{
"value" : 2.2,
"description" : "boost",
"details" : [ ]
},
{
"value" : 6.4560037,
"description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details" : [
{
"value" : 1003,
"description" : "n, number of documents containing term",
"details" : [ ]
},
{
"value" : 638739,
"description" : "N, total number of documents with field",
"details" : [ ]
}
]
},
{
"value" : 0.58432037,
"description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details" : [
{
"value" : 1.0,
"description" : "freq, occurrences of term within document",
"details" : [ ]
},
{
"value" : 1.2,
"description" : "k1, term saturation parameter",
"details" : [ ]
},
{
"value" : 0.75,
"description" : "b, length normalization parameter",
"details" : [ ]
},
{
"value" : 26.0,
"description" : "dl, length of field",
"details" : [ ]
},
{
"value" : 56.880337,
"description" : "avgdl, average length of field",
"details" : [ ]
}
]
}
]
}
]
}
]
}
]
},
{
"value" : 0.0,
"description" : "match on required clause, product of:",
"details" : [
{
"value" : 0.0,
"description" : "# clause",
"details" : [ ]
},
{
"value" : 1.0,
"description" : "DocValuesFieldExistsQuery [field=_primary_term]",
"details" : [ ]
}
]
}
]
}
}
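可以看到,在我的数据里,nested 类型的 documents.title 有很多就是“集体土地使用权登记审批表”,但正因为包含这些词的子文档数量太多,各个词的 IDF 都很低(例如 documents.title 中“审批表”的 idf 只有 2.007),四个词全部命中的总分也只有 6.76;而顶层 title 字段中“审批表”很少见(n=1003,idf 为 6.456),只命中这一个词就得到 8.30 分,所以文档 34589 反而排在了前面。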
1 个回复
CurryQin
赞同来自: