apache · costin · Jun 23, 2026 · Jun 23, 2026 · Jun 24, 2026 · Jun 26, 2026
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -279,7 +279,9 @@ Other
 
 API Changes
 ---------------------
-(No changes)
+
+* GITHUB#16286: Introduce BinaryDocValues#binaryValues to help speed up the
+  retrieval of many binary doc values at once. (Costin Leau)
 
 New Features
 ---------------------

diff --git a/...mark-jmh/src/java/org/apache/lucene/benchmark/jmh/BinaryDocValuesBulkDecodeBenchmark.java b/...mark-jmh/src/java/org/apache/lucene/benchmark/jmh/BinaryDocValuesBulkDecodeBenchmark.java
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Comparator;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+
+/**
+ * Benchmarks bulk retrieval of dense binary doc values via {@link BinaryDocValues#binaryValues}.
+ * Compares the per-doc default with the Lucene90 codec override that reads directly from the data
+ * slice. Batches walk the segment in doc ID order and restart on a fresh iterator at the end.
+ */
+@State(Scope.Thread)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+public class BinaryDocValuesBulkDecodeBenchmark {
+
+  private Directory dir;
+  private DirectoryReader reader;
+  private BinaryDocValues values;
+  private Path path;
+  private int[] docs;
+  private BytesRef[] valueBuffer;
+  private int nextStart;
+
+  @Param({"1000000"})
+  public int docCount;
+
+  @Param({"8", "32", "128"})
+  public int valueLength;
+
+  @Param({"128", "1024"})
+  public int batchSize;
+
+  @Param({"fixed", "variable"})
+  public String encoding;
+
+  @Setup(Level.Trial)
+  public void setup() throws Exception {
+    if (batchSize > docCount) {
+      throw new IllegalArgumentException("batchSize " + batchSize + " > docCount " + docCount);
+    }
+    path = Files.createTempDirectory("binaryDocValuesBulkDecode");
+    dir = MMapDirectory.open(path);
+    // try-with-resources: the writer is closed even if indexing fails part-way
+    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
+      Random random = new Random(0);
+      for (int i = 0; i < docCount; i++) {
+        Document doc = new Document();
+        int len = encoding.equals("fixed") ? valueLength : 1 + random.nextInt(valueLength);
+        byte[] bytes = new byte[len];
+        random.nextBytes(bytes);
+        doc.add(new BinaryDocValuesField("field", new BytesRef(bytes)));
+        writer.addDocument(doc);
+      }
+      writer.forceMerge(1);
+      reader = DirectoryReader.open(writer);
+    }
+
+    values = openValues();
+    docs = new int[batchSize];
+    valueBuffer = new BytesRef[batchSize];
+  }
+
+  @TearDown(Level.Trial)
+  public void tearDown() throws Exception {
+    reader.close();
+    dir.close();
+    if (Files.exists(path)) {
+      try (Stream<Path> walk = Files.walk(path)) {
+        walk.sorted(Comparator.reverseOrder())
+            .forEach(
+                p -> {
+                  try {
+                    Files.delete(p);
+                  } catch (IOException _) {
+                    // deliberate best-effort cleanup of the temporary index directory
+                  }
+                });
+      }
+    }
+  }
+
+  @Benchmark
+  @Fork(
+      value = 1,
+      jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"})
+  public int binaryValuesBulk() throws IOException {
+    final int start = nextBatchStart();
+    for (int i = 0; i < batchSize; i++) {
+      docs[i] = start + i;
+    }
+    values.binaryValues(batchSize, docs, valueBuffer);
+    int checksum = 0;
+    for (int i = 0; i < batchSize; i++) {
+      checksum += valueBuffer[i].length;
+    }
+    return checksum;
+  }
+
+  @Benchmark
+  @Fork(
+      value = 1,
+      jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"})
+  public int binaryValuesPerDoc() throws IOException {
+    final int start = nextBatchStart();
+    int checksum = 0;
+    for (int i = 0; i < batchSize; i++) {
+      // per-doc baseline: advance, read and deep-copy one value at a time
+      values.advanceExact(start + i);
+      BytesRef ref = BytesRef.deepCopyOf(values.binaryValue());
+      checksum += ref.length;
+    }
+    return checksum;
+  }
+
+  /** Opens a fresh iterator over the first leaf of the force-merged index. */
+  private BinaryDocValues openValues() throws IOException {
+    return reader.leaves().get(0).reader().getBinaryDocValues("field");
+  }
+
+  /**
+   * Returns the first doc ID of the next batch. Doc values iterators only move forward, so when the
+   * batches wrap around to doc 0 a fresh iterator is opened rather than rewinding the old one.
+   */
+  private int nextBatchStart() throws IOException {
+    if (nextStart > docCount - batchSize) {
+      nextStart = 0;
+      values = openValues();
+    }
+    final int start = nextStart;
+    nextStart += batchSize;
+    return start;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
@@ -1230,6 +1230,22 @@ public BytesRef binaryValue() throws IOException {
             bytesSlice.readBytes((long) doc * length, bytes.bytes, 0, length);
             return bytes;
           }
+
+          @Override
+          public void binaryValues(
+              int size, int[] docs, int docsOffset, BytesRef[] values, int valuesOffset)
+              throws IOException {
+            // Fixed-length values: the bytes of doc d start at d * length in bytesSlice.
+            final int end = docsOffset + size;
+            for (int src = docsOffset, dst = valuesOffset; src < end; src++, dst++) {
+              final byte[] copy = new byte[length];
+              bytesSlice.readBytes((long) docs[src] * length, copy, 0, length);
+              values[dst] = new BytesRef(copy, 0, length);
+            }
+            if (size != 0) {
+              doc = docs[end - 1]; // position on the last requested doc
+            }
+          }
         };
       } else {
         // variable length
@@ -1252,6 +1268,25 @@ public BytesRef binaryValue() throws IOException {
             bytesSlice.readBytes(startOffset, bytes.bytes, 0, bytes.length);
             return bytes;
           }
+
+          @Override
+          public void binaryValues(
+              int size, int[] docs, int docsOffset, BytesRef[] values, int valuesOffset)
+              throws IOException {
+            // Variable-length values: addresses d and d + 1 bound the bytes of doc d.
+            final int end = docsOffset + size;
+            for (int src = docsOffset, dst = valuesOffset; src < end; src++, dst++) {
+              final int target = docs[src];
+              final long from = addresses.get(target);
+              final int byteCount = (int) (addresses.get(target + 1L) - from);
+              final byte[] copy = new byte[byteCount];
+              bytesSlice.readBytes(from, copy, 0, byteCount);
+              values[dst] = new BytesRef(copy, 0, byteCount);
+            }
+            if (size != 0) {
+              doc = docs[end - 1]; // position on the last requested doc
+            }
+          }
         };
       }
     } else {

diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValues.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValues.java
@@ -18,9 +18,11 @@
 package org.apache.lucene.index;
 
 import java.io.IOException;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.FieldExistsQuery;
 import org.apache.lucene.util.BytesRef;
 
-/** A per-document numeric value. */
+/** A per-document binary value. */
 public abstract class BinaryDocValues extends DocValuesIterator {
 
   /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
@@ -33,4 +35,76 @@ protected BinaryDocValues() {}
    * @return binary value
    */
   public abstract BytesRef binaryValue() throws IOException;
+
+  /**
+   * Bulk retrieval of binary doc values. This API helps reduce the performance impact of virtual
+   * function calls.
+   *
+   * <p>This API behaves as if implemented as below, which matches the default implementation:
+   *
+   * <pre><code class="language-java">
+   * public void binaryValues(int size, int[] docs, BytesRef[] values) throws IOException {
+   *   for (int i = 0; i &lt; size; ++i) {
+   *     int doc = docs[i];
+   *     if (advanceExact(doc)) {
+   *       values[i] = BytesRef.deepCopyOf(binaryValue());
+   *     } else {
+   *       values[i] = null;
+   *     }
+   *   }
+   * }
+   * </code></pre>
+   *
+   * <p><b>NOTE</b>: The {@code docs} array is required to be sorted in ascending order with no
+   * duplicates.
+   *
+   * <p><b>NOTE</b>: Documents that don't have a value for this field will have their corresponding
+   * entry set to {@code null}. If you need to exclude documents that don't have a value, then you
+   * could apply a {@link FieldExistsQuery} as a {@link Occur#FILTER} clause. Another option is to
+   * fall back to using {@link #advanceExact} and {@link #binaryValue()} on ranges of doc IDs that
+   * may not be dense, e.g. with {@code dv} being this {@code BinaryDocValues} instance:
+   *
+   * <pre><code class="language-java">
+   * if (size &gt; 0 &amp;&amp; dv.advanceExact(docs[0]) &amp;&amp; dv.docIDRunEnd() &gt; docs[size - 1]) {
+   *   // use dv#binaryValues to retrieve values
+   * } else {
+   *   // some docs may not have a value, use dv#advanceExact and dv#binaryValue
+   * }
+   * </code></pre>
+   *
+   * <p><b>NOTE</b>: Each returned {@link BytesRef} is a deep copy owned by the caller and remains
+   * valid after subsequent calls.
+   *
+   * @param size the number of values to retrieve
+   * @param docs the buffer of doc IDs whose values should be looked up
+   * @param values the buffer of values to fill; entries are set to {@code null} when a document
+   *     doesn't have a value
+   */
+  public void binaryValues(int size, int[] docs, BytesRef[] values) throws IOException {
+    binaryValues(size, docs, 0, values, 0); // both buffers are used starting at index 0
+  }
+
+  /**
+   * Same lookup as {@link #binaryValues(int, int[], BytesRef[])}, but working on sub-ranges of the
+   * two buffers: {@code size} doc IDs are read starting at {@code docs[docsOffset]} and the
+   * matching values are stored starting at {@code values[valuesOffset]}. This follows the same
+   * convention as {@link System#arraycopy}.
+   *
+   * @param size the number of values to retrieve
+   * @param docs the buffer holding the doc IDs to look up
+   * @param docsOffset index in {@code docs} of the first doc ID to read
+   * @param values the buffer receiving the values; an entry is set to {@code null} when the
+   *     document doesn't have a value
+   * @param valuesOffset index in {@code values} of the first entry to write
+   */
+  public void binaryValues(
+      int size, int[] docs, int docsOffset, BytesRef[] values, int valuesOffset)
+      throws IOException {
+    final int end = docsOffset + size;
+    for (int src = docsOffset, dst = valuesOffset; src < end; src++, dst++) {
+      // deep copy so that every entry handed to the caller has its own bytes
+      final boolean hasValue = advanceExact(docs[src]);
+      values[dst] = hasValue ? BytesRef.deepCopyOf(binaryValue()) : null;
+    }
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -4113,6 +4113,49 @@ private static void checkBulkFetchNumericDocValues(
     }
   }
 
+  /**
+   * Cross-checks {@link BinaryDocValues#binaryValues} on {@code bdv} against per-document reads
+   * ({@code advanceExact} + {@code binaryValue}) on {@code bdv2}, in batches of up to 16 docs.
+   */
+  private static void checkBulkFetchBinaryDocValues(
+      String fieldName, BinaryDocValues bdv, BinaryDocValues bdv2, int maxDoc) throws IOException {
+    int[] docs = new int[16];
+    BytesRef[] values = new BytesRef[16];
+    for (int doc = -1; doc < maxDoc; ) {
+      int size = 0;
+      for (int j = 0; j < docs.length; ++j) {
+        // gaps cycle through 1, 2, 3, 4 so a batch mixes adjacent and non-adjacent doc IDs
+        doc += 1 + (j & 0x03);
+        if (doc >= maxDoc) {
+          break;
+        }
+        docs[size++] = doc;
+      }
+      bdv.binaryValues(size, docs, values);
+      for (int j = 0; j < size; ++j) {
+        if (bdv2.advanceExact(docs[j])) {
+          // no defensive copy needed: compared right away, before bdv2 is advanced again
+          BytesRef expected = bdv2.binaryValue();
+          if (values[j] == null || values[j].equals(expected) == false) {
+            throw new CheckIndexException(
+                "field "
+                    + fieldName
+                    + " #binaryValues reports different value: "
+                    + values[j]
+                    + " != "
+                    + expected);
+          }
+        } else if (values[j] != null) {
+          throw new CheckIndexException(
+              "field "
+                  + fieldName
+                  + " #binaryValues reports non-null for missing doc: "
+                  + values[j]);
+        }
+      }
+    }
+  }
+
   private static void checkDocValues(
       FieldInfo fi, int maxDoc, DocValuesProducer dvReader, DocValuesStatus status)
       throws Exception {
@@ -4141,6 +4184,8 @@ private static void checkDocValues(
         status.totalBinaryFields++;
         checkDVIterator(fi, dvReader::getBinary);
         checkBinaryDocValues(fi.name, dvReader.getBinary(fi), dvReader.getBinary(fi));
+        checkBulkFetchBinaryDocValues(
+            fi.name, dvReader.getBinary(fi), dvReader.getBinary(fi), maxDoc);
         break;
       case NUMERIC:
         status.totalNumericFields++;