三人行必有我师

Elasticsearch 6.7.1:ik_max_word 叠词问题导致 start_offset 非递增,Lucene 词元偏移量校验不通过,文档插入失败

Elasticsearch | 作者 Bill Don | 发布于2019年10月28日 | 阅读数:3587

环境:Elasticsearch 6.7.1,elasticsearch-analysis-ik 6.7.1
 
索引配置如下
PUT test
{
  "settings": {
    "index": {
      "refresh_interval": "1s",
      "number_of_replicas": "0",
      "number_of_shards": "1",
      "max_result_window": "2000000000",
      "analysis": {
        "tokenizer": {
          "ngram_tokenizer": {
            "type": "ngram",
            "min_gram": 1,
            "max_gram": 20,
            "token_chars": [
              "letter",
              "digit"
            ]
          }
        },
        "filter": {
          "synonym_filter": {
            "type": "synonym",
            "synonyms_path": "dic/synonym.dic"
          }
        },
        "analyzer": {
          "text_ik": {
            "type": "custom",
            "tokenizer": "ik_max_word",
            "filter": [ "synonym_filter" ]
          }
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "filecontent": {
          "type": "text",
          "term_vector": "with_positions_offsets",
          "analyzer": "text_ik",
          "search_analyzer": "ik_max_word"
        }
      }
    }
  }
}

插入文本见附件
 
使用 Java 执行插入,代码如下:
		// Load the whole attachment into memory; Files.readAllBytes returns byte[] (not byte).
		byte[] data = Files.readAllBytes(Paths.get("data.txt"));
		// NOTE(review): decodes with the platform default charset -- confirm it matches the file's encoding.
		String txt = new String(data);
		JSONObject obj = new JSONObject();
		// Replace every line break with a single space before indexing.
		obj.put("filecontent", txt.replaceAll("(\r\n|\r|\n|\n\r)", " "));
		// Send the document to the "test" index as _doc id 1.
		Request.Post("http://localhost:9200/test/_doc/1")
				.bodyString(obj.toJSONString(), ContentType.APPLICATION_JSON)
				.execute();

es容器报错如下
[2019-10-28T16:30:54,528][DEBUG][o.e.a.b.TransportShardBulkAction] [6pV94Eq] [test][0] failed to execute bulk item (index) index {[test][_doc][1], source[n/a, actual length: [1.4mb], max length: 2kb]}

java.lang.IllegalArgumentException: startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards startOffset=140968,endOffset=140969,lastStartOffset=140969 for field 'filecontent'
at org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:824) ~[lucene-core-7.7.0.jar:7.7.0 8c831daf4eb41153c25ddb152501ab5bae3ea3d5 - jimczi - 2019-02-04 23:16:28]
at org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) ~[lucene-core-7.7.0.jar:7.7.0 8c831daf4eb41153c25ddb152501ab5bae3ea3d5 - jimczi - 2019-02-04 23:16:28]
at org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:394) ~[lucene-core-7.7.0.jar:7.7.0 8c831daf4eb41153c25ddb152501ab5bae3ea3d5 - jimczi - 2019-02-04 23:16:28]
at org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:251) ~[lucene-core-7.7.0.jar:7.7.0 8c831daf4eb41153c25ddb152501ab5bae3ea3d5 - jimczi - 2019-02-04 23:16:28]
at org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:494) ~[lucene-core-7.7.0.jar:7.7.0 8c831daf4eb41153c25ddb152501ab5bae3ea3d5 - jimczi - 2019-02-04 23:16:28]
at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1616) ~[lucene-core-7.7.0.jar:7.7.0 8c831daf4eb41153c25ddb152501ab5bae3ea3d5 - jimczi - 2019-02-04 23:16:28]
at org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1235) ~[lucene-core-7.7.0.jar:7.7.0 8c831daf4eb41153c25ddb152501ab5bae3ea3d5 - jimczi - 2019-02-04 23:16:28]
at org.elasticsearch.index.engine.InternalEngine.addDocs(InternalEngine.java:1164) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1109) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:924) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:824) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:791) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:744) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.bulk.TransportShardBulkAction.lambda$executeIndexRequestOnPrimary$3(TransportShardBulkAction.java:454) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.bulk.TransportShardBulkAction.executeOnPrimaryWhileHandlingMappingUpdates(TransportShardBulkAction.java:477) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:452) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:216) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:159) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:151) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:139) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:79) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform(TransportReplicationAction.java:1050) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform(TransportReplicationAction.java:1028) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:105) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.runWithPrimaryShardReference(TransportReplicationAction.java:424) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.lambda$doRun$0(TransportReplicationAction.java:370) ~[elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.ActionListener$1.onResponse(ActionListener.java:61) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:273) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:240) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2561) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryOperationPermit(TransportReplicationAction.java:987) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun(TransportReplicationAction.java:369) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:324) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:311) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.xpack.security.transport.SecurityServerTransportInterceptor$ProfileSecuredRequestHandler$1.doRun(SecurityServerTransportInterceptor.java:250) [x-pack-security-6.7.1.jar:6.7.1]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.xpack.security.transport.SecurityServerTransportInterceptor$ProfileSecuredRequestHandler.messageReceived(SecurityServerTransportInterceptor.java:308) [x-pack-security-6.7.1.jar:6.7.1]
at org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:686) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:751) [elasticsearch-6.7.1.jar:6.7.1]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) [elasticsearch-6.7.1.jar:6.7.1]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_151]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_151]
at java.lang.Thread.run(Thread.java:748) [?:1.8.0_151]

我在 GitHub 上看到过类似的错误(https://github.com/elastic/ela ... 32694)。于是我尝试把 synonym 改成 synonym_graph,重建索引之后仍然报错;之后把分析器 text_ik 改成 ik_max_word,仍然报错;最后把分词器全部去掉,插入成功;将分词器改成 ik_smart,插入也成功。请问问题出在哪里?
已邀请:

Bill Don

赞同来自:

微信图片_20191028183141.png

通过分析器将文本按 ik_max_word 分词后,定位到报错位置的 start_offset,发现相邻词元的 start_offset 先变小、后变大(即偏移量出现了回退),前后相差 1。这是 ik 的 bug 吗?
 
Lucene 7.7.0 源码中对分词词元偏移量的校验:
微信图片_20191029100232.png


这个问题应该和输出的 term 里面有叠词有关。
短文本“得饶人处且饶人”,将字段的分析器设置为 ik_max_word 后也能重现该错误!
 

要回复问题请先登录注册