apache · iemejia · Apr 19, 2026 · Apr 20, 2026 · Apr 20, 2026
diff --git a/parquet-benchmarks/pom.xml b/parquet-benchmarks/pom.xml
@@ -89,6 +89,18 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <annotationProcessorPaths>
+            <path>
+              <groupId>org.openjdk.jmh</groupId>
+              <artifactId>jmh-generator-annprocess</artifactId>
+              <version>${jmh.version}</version>
+            </path>
+          </annotationProcessorPaths>
+          <annotationProcessors>
+            <annotationProcessor>org.openjdk.jmh.generators.BenchmarkProcessor</annotationProcessor>
+          </annotationProcessors>
+        </configuration>
       </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
@@ -107,6 +119,12 @@
                 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                   <mainClass>org.openjdk.jmh.Main</mainClass>
                 </transformer>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+                  <resource>META-INF/BenchmarkList</resource>
+                </transformer>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+                  <resource>META-INF/CompilerHints</resource>
+                </transformer>
               </transformers>
               <artifactSet>
                 <includes>

diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkEncodingUtils.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkEncodingUtils.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.benchmarks;
+
+import java.io.IOException;
+import org.apache.parquet.column.page.DictionaryPage;
+import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter;
+
+/**
+ * Shared helpers for encode/decode micro-benchmarks.
+ */
+final class BenchmarkEncodingUtils {
+
+  private BenchmarkEncodingUtils() {}
+
+  /**
+   * Container for the two artefacts produced by a dictionary-encoded page:
+   * the encoded dictionary indices ({@link #dictData}) and the dictionary
+   * page itself ({@link #dictPage}). The dictionary page may be {@code null}
+   * if the writer fell back to plain encoding (for example, when the
+   * dictionary exceeded its configured maximum size).
+   */
+  static final class EncodedDictionary {
+    final byte[] dictData;
+    final DictionaryPage dictPage;
+
+    EncodedDictionary(byte[] dictData, DictionaryPage dictPage) {
+      this.dictData = dictData;
+      this.dictPage = dictPage;
+    }
+
+    boolean fellBackToPlain() {
+      return dictPage == null;
+    }
+  }
+
+  /**
+   * Drains a {@link DictionaryValuesWriter} into an {@link EncodedDictionary}.
+   *
+   * <p>The writer's data bytes (the RLE-encoded indices) and the dictionary
+   * page are returned separately so both pieces can be measured or fed to a
+   * decoder symmetrically. The dictionary page buffer is copied so it remains
+   * valid after the writer's allocator is released.
+   *
+   * <p>The writer is closed via {@code toDictPageAndClose()}; callers must not
+   * call {@link DictionaryValuesWriter#close()} again afterwards.
+   */
+  static EncodedDictionary drainDictionary(DictionaryValuesWriter writer) throws IOException {
+    byte[] dictData = writer.getBytes().toByteArray();
+    DictionaryPage rawPage = writer.toDictPageAndClose();
+    DictionaryPage dictPage = rawPage == null ? null : rawPage.copy();
+    return new EncodedDictionary(dictData, dictPage);
+  }
+}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BinaryEncodingBenchmark.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.benchmarks;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.concurrent.TimeUnit;
+import org.apache.parquet.bytes.ByteBufferInputStream;
+import org.apache.parquet.bytes.HeapByteBufferAllocator;
+import org.apache.parquet.column.Dictionary;
+import org.apache.parquet.column.Encoding;
+import org.apache.parquet.column.page.DictionaryPage;
+import org.apache.parquet.column.values.ValuesWriter;
+import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesReader;
+import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesWriter;
+import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader;
+import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter;
+import org.apache.parquet.column.values.dictionary.DictionaryValuesReader;
+import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter;
+import org.apache.parquet.column.values.dictionary.PlainValuesDictionary;
+import org.apache.parquet.column.values.plain.BinaryPlainValuesReader;
+import org.apache.parquet.column.values.plain.PlainValuesWriter;
+import org.apache.parquet.io.api.Binary;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OperationsPerInvocation;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * Encoding-level and decoding-level micro-benchmarks for BINARY values.
+ * Compares PLAIN, DELTA_BYTE_ARRAY, DELTA_LENGTH_BYTE_ARRAY, and DICTIONARY encodings
+ * across different string lengths and cardinality patterns.
+ *
+ * <p>Each benchmark invocation processes {@value #VALUE_COUNT} values. Throughput is
+ * reported per-value using {@link OperationsPerInvocation}.
+ *
+ * <p>The dictionary encode/decode benchmarks intentionally measure the full path:
+ * the encoder produces both the RLE-encoded indices and a {@link DictionaryPage};
+ * the decoder consumes the indices through a {@link DictionaryValuesReader} backed
+ * by the same dictionary. If the dictionary exceeds {@link #MAX_DICT_BYTE_SIZE}
+ * (which can happen for high-cardinality, long-string parameter combinations) the
+ * writer falls back to plain encoding and dictionary decoding for that combination
+ * is skipped.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Fork(1)
+@Warmup(iterations = 3, time = 1)
+@Measurement(iterations = 5, time = 1)
+@State(Scope.Thread)
+public class BinaryEncodingBenchmark {
+
+  static final int VALUE_COUNT = 100_000;
+  private static final int INIT_SLAB_SIZE = 64 * 1024;
+  private static final int PAGE_SIZE = 4 * 1024 * 1024;
+  private static final int MAX_DICT_BYTE_SIZE = 4 * 1024 * 1024;
+
+  @Param({"10", "100", "1000"})
+  public int stringLength;
+
+  /** LOW = 100 distinct values; HIGH = all unique. */
+  @Param({"LOW", "HIGH"})
+  public String cardinality;
+
+  private Binary[] data;
+  private byte[] plainEncoded;
+  private byte[] deltaLengthEncoded;
+  private byte[] deltaStringsEncoded;
+  private byte[] dictEncoded;
+  private DictionaryPage dictPage;
+  private Dictionary binaryDictionary;
+  private boolean dictionaryAvailable;
+
+  @Setup(Level.Trial)
+  public void setup() throws IOException {
+    int distinct = "LOW".equals(cardinality) ? TestDataFactory.LOW_CARDINALITY_DISTINCT : 0;
+    data = TestDataFactory.generateBinaryData(VALUE_COUNT, stringLength, distinct, TestDataFactory.DEFAULT_SEED);
+
+    // Pre-encode data for decode benchmarks
+    plainEncoded = encodeBinaryWith(newPlainWriter());
+    deltaLengthEncoded = encodeBinaryWith(newDeltaLengthWriter());
+    deltaStringsEncoded = encodeBinaryWith(newDeltaStringsWriter());
+
+    DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter dictWriter = newDictWriter();
+    for (Binary v : data) {
+      dictWriter.writeBytes(v);
+    }
+    BenchmarkEncodingUtils.EncodedDictionary encoded = BenchmarkEncodingUtils.drainDictionary(dictWriter);
+    dictEncoded = encoded.dictData;
+    dictPage = encoded.dictPage;
+    dictionaryAvailable = !encoded.fellBackToPlain();
+    if (dictionaryAvailable) {
+      binaryDictionary = new PlainValuesDictionary.PlainBinaryDictionary(dictPage);
+    }
+  }
+
+  private byte[] encodeBinaryWith(ValuesWriter writer) throws IOException {
+    for (Binary v : data) {
+      writer.writeBytes(v);
+    }
+    byte[] bytes = writer.getBytes().toByteArray();
+    writer.close();
+    return bytes;
+  }
+
+  private BenchmarkEncodingUtils.EncodedDictionary encodeDictionaryWith(
+      DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter writer) throws IOException {
+    for (Binary v : data) {
+      writer.writeBytes(v);
+    }
+    return BenchmarkEncodingUtils.drainDictionary(writer);
+  }
+
+  // ---- Writer factories ----
+
+  private static PlainValuesWriter newPlainWriter() {
+    return new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
+  }
+
+  private static DeltaLengthByteArrayValuesWriter newDeltaLengthWriter() {
+    return new DeltaLengthByteArrayValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
+  }
+
+  private static DeltaByteArrayWriter newDeltaStringsWriter() {
+    return new DeltaByteArrayWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
+  }
+
+  private static DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter newDictWriter() {
+    return new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter(
+        MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator());
+  }
+
+  // ---- Encode benchmarks ----
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public byte[] encodePlain() throws IOException {
+    return encodeBinaryWith(newPlainWriter());
+  }
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public byte[] encodeDeltaLengthByteArray() throws IOException {
+    return encodeBinaryWith(newDeltaLengthWriter());
+  }
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public byte[] encodeDeltaByteArray() throws IOException {
+    return encodeBinaryWith(newDeltaStringsWriter());
+  }
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public void encodeDictionary(Blackhole bh) throws IOException {
+    BenchmarkEncodingUtils.EncodedDictionary encoded = encodeDictionaryWith(newDictWriter());
+    bh.consume(encoded.dictData);
+    bh.consume(encoded.dictPage);
+  }
+
+  // ---- Decode benchmarks ----
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public void decodePlain(Blackhole bh) throws IOException {
+    BinaryPlainValuesReader reader = new BinaryPlainValuesReader();
+    reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded)));
+    for (int i = 0; i < VALUE_COUNT; i++) {
+      bh.consume(reader.readBytes());
+    }
+  }
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public void decodeDeltaLengthByteArray(Blackhole bh) throws IOException {
+    DeltaLengthByteArrayValuesReader reader = new DeltaLengthByteArrayValuesReader();
+    reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaLengthEncoded)));
+    for (int i = 0; i < VALUE_COUNT; i++) {
+      bh.consume(reader.readBytes());
+    }
+  }
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public void decodeDeltaByteArray(Blackhole bh) throws IOException {
+    DeltaByteArrayReader reader = new DeltaByteArrayReader();
+    reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaStringsEncoded)));
+    for (int i = 0; i < VALUE_COUNT; i++) {
+      bh.consume(reader.readBytes());
+    }
+  }
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public void decodeDictionary(Blackhole bh) throws IOException {
+    if (!dictionaryAvailable) {
+      // Dictionary fell back to plain encoding (e.g. high-cardinality long strings
+      // exceeding MAX_DICT_BYTE_SIZE). Skip to keep the benchmark meaningful.
+      return;
+    }
+    DictionaryValuesReader reader = new DictionaryValuesReader(binaryDictionary);
+    reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(dictEncoded)));
+    for (int i = 0; i < VALUE_COUNT; i++) {
+      bh.consume(reader.readBytes());
+    }
+  }
+}