diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 978f361159f3..cc0ecd3ca2ea 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -191,6 +191,8 @@ Bug Fixes * GITHUB#15939: Fix thread-safety issues with NFARunAutomaton. (Dimitris Rempapis) +* GITHUB#16246: Fix FVH StringIndexOutOfBoundsException with overlapping token offsets. (hanbj) + Changes in Runtime Behavior --------------------- * GITHUB#14187: The query cache is now disabled by default. (Adrien Grand) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java index 395afd5e7fe6..d6d7912518ff 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java @@ -214,18 +214,30 @@ protected String makeFragment( int srcIndex = 0; for (SubInfo subInfo : fragInfo.getSubInfos()) { for (Toffs to : subInfo.termsOffsets()) { + int toffsStart = to.getStartOffset() - modifiedStartOffset[0]; + int toffsEnd = to.getEndOffset() - modifiedStartOffset[0]; + // Skip tokens with truly invalid offsets that cannot be salvaged. + if (toffsEnd < toffsStart || toffsEnd > src.length() || toffsEnd <= 0) { + continue; + } + if (toffsStart < 0) { + toffsStart = 0; + } + // Handle overlapping tokens: analyzers (e.g. CJK bigram, ik_max_word) commonly + // produce tokens whose offsets overlap. Rather than skipping, highlight the + // non-overlapping tail so the full matched region is covered. 
+ if (toffsStart < srcIndex) { + if (toffsEnd <= srcIndex) { + continue; + } + toffsStart = srcIndex; + } fragment - .append( - encoder.encodeText( - src.substring(srcIndex, to.getStartOffset() - modifiedStartOffset[0]))) + .append(encoder.encodeText(src.substring(srcIndex, toffsStart))) .append(getPreTag(preTags, subInfo.seqnum())) - .append( - encoder.encodeText( - src.substring( - to.getStartOffset() - modifiedStartOffset[0], - to.getEndOffset() - modifiedStartOffset[0]))) + .append(encoder.encodeText(src.substring(toffsStart, toffsEnd))) .append(getPostTag(postTags, subInfo.seqnum())); - srcIndex = to.getEndOffset() - modifiedStartOffset[0]; + srcIndex = toffsEnd; } } fragment.append(encoder.encodeText(src.substring(srcIndex))); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestBaseFragmentsBuilderOverlappingOffsets.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestBaseFragmentsBuilderOverlappingOffsets.java new file mode 100644 index 000000000000..db500b354e55 --- /dev/null +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/TestBaseFragmentsBuilderOverlappingOffsets.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.vectorhighlight; + +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.search.highlight.DefaultEncoder; +import org.apache.lucene.search.highlight.Encoder; +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs; +import org.apache.lucene.tests.util.LuceneTestCase; + +/** + * Tests that BaseFragmentsBuilder handles overlapping token offsets correctly. + * + *
+ * <p>Analyzers like CJK bigram, ik_max_word, and other multi-granularity tokenizers produce tokens
+ * with overlapping offsets. For example, "中华人民共和国" may produce tokens at [0,2), [0,4), [1,3),
+ * [2,4), [2,7), [4,6), [4,7). The highlighter must handle these without throwing
+ * StringIndexOutOfBoundsException and should highlight the full matched region.
+ *
+ * @see <a href="https://github.com/elastic/elasticsearch/issues/73072">Elasticsearch #73072</a>
+ */
+public class TestBaseFragmentsBuilderOverlappingOffsets extends LuceneTestCase {
+
+ // Encoder and highlight tag arrays handed to SimpleFragmentsBuilder.makeFragment by the
+ // makeFragment helper in this class.
+ private static final Encoder ENCODER = new DefaultEncoder();
+ // NOTE(review): both tag arrays contain only an empty string. Presumably the real markup
+ // (something like "<b>" / "</b>") was stripped when this patch was extracted, since other
+ // HTML-like tokens in this file are missing too -- confirm against the upstream test file.
+ private static final String[] PRE_TAGS = {""};
+ private static final String[] POST_TAGS = {""};
+
+  /**
+   * Renders {@code fragInfo} against a single field named {@code "f"} whose value is
+   * {@code sourceText}.
+   *
+   * @param sourceText text used as the value of the one field passed to the builder
+   * @param fragInfo fragment description to render
+   * @return the string returned by {@link SimpleFragmentsBuilder}'s makeFragment when called with
+   *     this class's PRE_TAGS, POST_TAGS and ENCODER
+   */
+  private String makeFragment(String sourceText, WeightedFragInfo fragInfo) {
+    SimpleFragmentsBuilder builder = new SimpleFragmentsBuilder();
+
+    // Copy of TextField.TYPE_STORED with term vectors switched on.
+    FieldType vectorType = new FieldType(TextField.TYPE_STORED);
+    vectorType.setStoreTermVectors(true);
+    Field onlyValue = new Field("f", sourceText, vectorType);
+    Field[] fieldValues = {onlyValue};
+
+    // Fresh output buffer plus a one-element index array starting at 0.
+    int[] valueIndex = new int[] {0};
+    StringBuilder out = new StringBuilder();
+    return builder.makeFragment(
+        out, valueIndex, fieldValues, fragInfo, PRE_TAGS, POST_TAGS, ENCODER);
+  }
+
+ /**
+ * Overlapping tokens [0,10) and [8,14): the second token starts before the first one ends. Before
+ * the fix this throws StringIndexOutOfBoundsException: Range [10, 8) out of bounds.
+ */
+ public void testOverlappingTokensHighlightExtendedRegion() {
+ String src = "hello world test data";
+
+ List