apache · hanbj · Jun 11, 2026 · Jun 12, 2026
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -191,6 +191,8 @@ Bug Fixes
 
 * GITHUB#15939: Fix thread-safety issues with NFARunAutomaton. (Dimitris Rempapis)
 
+* GITHUB#16246: Fix FVH StringIndexOutOfBoundsException with overlapping token offsets. (hanbj)
+
 Changes in Runtime Behavior
 ---------------------
 * GITHUB#14187: The query cache is now disabled by default. (Adrien Grand)

diff --git a/...e/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java b/...e/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
@@ -214,18 +214,30 @@ protected String makeFragment(
     int srcIndex = 0;
     for (SubInfo subInfo : fragInfo.getSubInfos()) {
       for (Toffs to : subInfo.termsOffsets()) {
+        int toffsStart = to.getStartOffset() - modifiedStartOffset[0];
+        int toffsEnd = to.getEndOffset() - modifiedStartOffset[0];
+        // Skip tokens with truly invalid offsets that cannot be salvaged.
+        if (toffsEnd < toffsStart || toffsEnd > src.length() || toffsEnd <= 0) {
+          continue;
+        }
+        if (toffsStart < 0) {
+          toffsStart = 0;
+        }
+        // Handle overlapping tokens: analyzers (e.g. CJK bigram, ik_max_word) commonly
+        // produce tokens whose offsets overlap. Rather than skipping, highlight the
+        // non-overlapping tail so the full matched region is covered.
+        if (toffsStart < srcIndex) {
+          if (toffsEnd <= srcIndex) {
+            continue;
+          }
+          toffsStart = srcIndex;
+        }
         fragment
-            .append(
-                encoder.encodeText(
-                    src.substring(srcIndex, to.getStartOffset() - modifiedStartOffset[0])))
+            .append(encoder.encodeText(src.substring(srcIndex, toffsStart)))
             .append(getPreTag(preTags, subInfo.seqnum()))
-            .append(
-                encoder.encodeText(
-                    src.substring(
-                        to.getStartOffset() - modifiedStartOffset[0],
-                        to.getEndOffset() - modifiedStartOffset[0])))
+            .append(encoder.encodeText(src.substring(toffsStart, toffsEnd)))
             .append(getPostTag(postTags, subInfo.seqnum()));
-        srcIndex = to.getEndOffset() - modifiedStartOffset[0];
+        srcIndex = toffsEnd;
       }
     }
     fragment.append(encoder.encodeText(src.substring(srcIndex)));

diff --git a/.../org/apache/lucene/search/vectorhighlight/TestBaseFragmentsBuilderOverlappingOffsets.java b/.../org/apache/lucene/search/vectorhighlight/TestBaseFragmentsBuilderOverlappingOffsets.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.vectorhighlight;
+
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.search.highlight.DefaultEncoder;
+import org.apache.lucene.search.highlight.Encoder;
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
+import org.apache.lucene.tests.util.LuceneTestCase;
+
+/**
+ * Tests that BaseFragmentsBuilder handles overlapping token offsets correctly.
+ *
+ * <p>Analyzers like CJK bigram, ik_max_word, and other multi-granularity tokenizers produce tokens
+ * with overlapping offsets. For example, "中华人民共和国" may produce tokens at [0,2), [0,4), [1,3),
+ * [2,4), [2,7), [4,6), [4,7). The highlighter must handle these without throwing
+ * StringIndexOutOfBoundsException and should highlight the full matched region.
+ *
+ * @see <a href="https://github.com/elastic/elasticsearch/issues/73072">Elasticsearch #73072</a>
+ */
+public class TestBaseFragmentsBuilderOverlappingOffsets extends LuceneTestCase {
+
+  private static final Encoder ENCODER = new DefaultEncoder();
+  private static final String[] PRE_TAGS = {"<b>"};
+  private static final String[] POST_TAGS = {"</b>"};
+
+  private String makeFragment(String sourceText, WeightedFragInfo fragInfo) {
+    SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
+    FieldType ft = new FieldType(TextField.TYPE_STORED);
+    ft.setStoreTermVectors(true);
+    Field[] values = new Field[] {new Field("f", sourceText, ft)};
+    StringBuilder buffer = new StringBuilder();
+    int[] index = {0};
+    return sfb.makeFragment(buffer, index, values, fragInfo, PRE_TAGS, POST_TAGS, ENCODER);
+  }
+
+  /**
+   * Overlapping tokens [0,10) and [8,14): the second token starts before the first one ends. Before
+   * the fix this throws StringIndexOutOfBoundsException: Range [10, 8) out of bounds.
+   */
+  public void testOverlappingTokensHighlightExtendedRegion() {
+    String src = "hello world test data";
+
+    List<Toffs> toffsList = new ArrayList<>();
+    toffsList.add(new Toffs(0, 10));
+    toffsList.add(new Toffs(8, 14));
+
+    List<SubInfo> subInfos = new ArrayList<>();
+    subInfos.add(new SubInfo("term", toffsList, 0, 1.0f));
+
+    WeightedFragInfo fragInfo = new WeightedFragInfo(0, 21, subInfos, 1.0f);
+    String result = makeFragment(src, fragInfo);
+
+    // [0,10) highlighted, then [8,14) clipped to [10,14)
+    assertEquals("<b>hello worl</b><b>d te</b>st data", result);
+  }
+
+  /**
+   * Token entirely contained within a previously highlighted region should be skipped without
+   * error.
+   */
+  public void testFullyContainedTokenIsSkipped() {
+    String src = "abcdefghij";
+
+    List<Toffs> toffsList = new ArrayList<>();
+    toffsList.add(new Toffs(0, 8)); // covers [0,8)
+    toffsList.add(new Toffs(2, 5)); // entirely within [0,8)
+
+    List<SubInfo> subInfos = new ArrayList<>();
+    subInfos.add(new SubInfo("term", toffsList, 0, 1.0f));
+
+    WeightedFragInfo fragInfo = new WeightedFragInfo(0, 10, subInfos, 1.0f);
+    String result = makeFragment(src, fragInfo);
+
+    assertEquals("<b>abcdefgh</b>ij", result);
+  }
+
+  /**
+   * CJK max-word segmentation: "中华人民共和国" produces 7 overlapping tokens that collectively cover
+   * [0,7). All characters must be highlighted.
+   */
+  public void testCjkMaxWordOverlappingTokens() {
+    String src = "中华人民共和国";
+
+    List<Toffs> toffsList = new ArrayList<>();
+    toffsList.add(new Toffs(0, 2)); // 中华
+    toffsList.add(new Toffs(0, 4)); // 中华人民
+    toffsList.add(new Toffs(1, 3)); // 华人
+    toffsList.add(new Toffs(2, 4)); // 人民
+    toffsList.add(new Toffs(2, 7)); // 人民共和国
+    toffsList.add(new Toffs(4, 6)); // 共和
+    toffsList.add(new Toffs(4, 7)); // 共和国
+
+    List<SubInfo> subInfos = new ArrayList<>();
+    subInfos.add(new SubInfo("term", toffsList, 0, 7.0f));
+
+    WeightedFragInfo fragInfo = new WeightedFragInfo(0, 7, subInfos, 7.0f);
+    String result = makeFragment(src, fragInfo);
+
+    // Tokens: [0,2) -> "中华", [0,4) clipped to [2,4) -> "人民",
+    // [2,7) clipped to [4,7) -> "共和国". Rest fully contained, skipped.
+    assertEquals("<b>中华</b><b>人民</b><b>共和国</b>", result);
+  }
+
+  /** CJK overlapping tokens with surrounding non-highlighted context. */
+  public void testCjkMaxWordWithContext() {
+    String src = "我爱中华人民共和国万岁";
+
+    List<Toffs> toffsList = new ArrayList<>();
+    toffsList.add(new Toffs(2, 4)); // 中华
+    toffsList.add(new Toffs(2, 6)); // 中华人民
+    toffsList.add(new Toffs(3, 5)); // 华人
+    toffsList.add(new Toffs(4, 6)); // 人民
+    toffsList.add(new Toffs(4, 9)); // 人民共和国
+    toffsList.add(new Toffs(6, 8)); // 共和
+    toffsList.add(new Toffs(6, 9)); // 共和国
+
+    List<SubInfo> subInfos = new ArrayList<>();
+    subInfos.add(new SubInfo("term", toffsList, 0, 7.0f));
+
+    WeightedFragInfo fragInfo = new WeightedFragInfo(0, 11, subInfos, 7.0f);
+    String result = makeFragment(src, fragInfo);
+
+    assertEquals("我爱<b>中华</b><b>人民</b><b>共和国</b>万岁", result);
+  }
+
+  /** Token whose endOffset exceeds source length should be skipped. */
+  public void testTokenExceedingSourceLength() {
+    String src = "short";
+
+    List<Toffs> toffsList = new ArrayList<>();
+    toffsList.add(new Toffs(0, 3)); // valid
+    toffsList.add(new Toffs(3, 99)); // endOffset > src.length()
+
+    List<SubInfo> subInfos = new ArrayList<>();
+    subInfos.add(new SubInfo("term", toffsList, 0, 1.0f));
+
+    WeightedFragInfo fragInfo = new WeightedFragInfo(0, 5, subInfos, 1.0f);
+    String result = makeFragment(src, fragInfo);
+
+    assertEquals("<b>sho</b>rt", result);
+  }
+
+  /** Multiple SubInfos with overlapping offsets across different query terms. */
+  public void testOverlappingAcrossSubInfos() {
+    String src = "the quick brown fox";
+
+    List<Toffs> toffs1 = new ArrayList<>();
+    toffs1.add(new Toffs(4, 9)); // "quick"
+    SubInfo sub1 = new SubInfo("quick", toffs1, 0, 1.0f);
+
+    List<Toffs> toffs2 = new ArrayList<>();
+    toffs2.add(new Toffs(7, 14)); // "ck brow" - overlaps with "quick"
+    SubInfo sub2 = new SubInfo("overlap", toffs2, 1, 1.0f);
+
+    List<SubInfo> subInfos = new ArrayList<>();
+    subInfos.add(sub1);
+    subInfos.add(sub2);
+
+    WeightedFragInfo fragInfo = new WeightedFragInfo(0, 19, subInfos, 2.0f);
+    String result = makeFragment(src, fragInfo);
+
+    // "quick" [4,9) highlighted, then [7,14) clipped to [9,14) = " brow"
+    assertEquals("the <b>quick</b><b> brow</b>n fox", result);
+  }
+
+  /** Inverted offset (endOffset &lt; startOffset) should be skipped without error. */
+  public void testInvertedOffsetsSkipped() {
+    String src = "hello world";
+
+    List<Toffs> toffsList = new ArrayList<>();
+    toffsList.add(new Toffs(5, 3)); // inverted: end < start
+    toffsList.add(new Toffs(0, 5)); // valid
+
+    List<SubInfo> subInfos = new ArrayList<>();
+    subInfos.add(new SubInfo("term", toffsList, 0, 1.0f));
+
+    WeightedFragInfo fragInfo = new WeightedFragInfo(0, 11, subInfos, 1.0f);
+    String result = makeFragment(src, fragInfo);
+
+    // Inverted offset skipped, valid token highlighted
+    assertEquals("<b>hello</b> world", result);
+  }
+}