From 3574a5e4357c61533687d646d44681a8c94def7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?=
Date: Sun, 19 Apr 2026 19:35:49 +0000
Subject: [PATCH] GH-3509: Optimize BinaryPlainValuesReader by reading directly from ByteBuffer

BinaryPlainValuesReader.readBytes is the hot-path decoder for BINARY (and
STRING) columns using PLAIN encoding. The current implementation funnels
every length read through BytesUtils.readIntLittleEndian(InputStream),
which calls in.read() four times with full IOException plumbing and
virtual dispatch on ByteBufferInputStream, and slices every value through
a virtual ByteBufferInputStream.slice(int).

This change replaces the ByteBufferInputStream field with a single
ByteBuffer set up once in initFromPage. The length prefix is then a single
ByteBuffer.getInt() (one bounds check, JIT-friendly little-endian
intrinsic, no IOException plumbing) and each value slice is a direct
ByteBuffer.slice() instead of a virtual ByteBufferInputStream.slice(int).

When the input is a MultiBufferInputStream the upfront
stream.slice(available) call may consolidate the page into a single fresh
ByteBuffer. This is one allocation per page in exchange for inlined
per-value reads, which is a clear win whenever the page contains more than
a handful of values.

Benchmark (BinaryEncodingBenchmark.decodePlain, 100k values per
invocation, JDK 18, JMH -wi 5 -i 10 -f 3, 30 samples per row):

  cardinality  stringLen  Before (ops/s)  After (ops/s)  Improvement
  HIGH                10      23,114,969     27,126,384  +17.4% (1.17x)
  HIGH               100      20,516,861     22,200,091   +8.2% (1.08x)
  HIGH              1000       7,069,927      7,679,070   +8.6% (1.09x)
  LOW                 10      22,885,778     26,459,404  +15.6% (1.16x)
  LOW                100      20,349,900     22,158,675   +8.9% (1.09x)
  LOW               1000       6,279,616      7,500,811  +19.4% (1.19x)

Per-op allocation is unchanged (~88 B/op = the returned Binary + the
per-value ByteBuffer slice). The improvement is largest at small string
lengths because the per-value fixed cost dominates more there.

All 573 parquet-column tests pass.
---
 .../values/plain/BinaryPlainValuesReader.java | 54 +++++++++++++++----
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BinaryPlainValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BinaryPlainValuesReader.java
index 6ce2f31a43..c0532f2961 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BinaryPlainValuesReader.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/BinaryPlainValuesReader.java
@@ -19,36 +19,63 @@
 package org.apache.parquet.column.values.plain;
 
 import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import org.apache.parquet.bytes.ByteBufferInputStream;
-import org.apache.parquet.bytes.BytesUtils;
 import org.apache.parquet.column.values.ValuesReader;
 import org.apache.parquet.io.ParquetDecodingException;
 import org.apache.parquet.io.api.Binary;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+/**
+ * Plain encoding reader for BINARY values.
+ *
+ * <p>Reads directly from a {@link ByteBuffer} with {@link ByteOrder#LITTLE_ENDIAN} byte order,
+ * using {@link ByteBuffer#getInt()} for the 4-byte length prefix instead of 4 individual
+ * {@code InputStream.read()} calls through {@link org.apache.parquet.bytes.BytesUtils#readIntLittleEndian}.
+ *
+ * <p>A truncated page or an invalid length prefix is reported as a
+ * {@link ParquetDecodingException}, as it was before the switch to {@link ByteBuffer}.
+ */
 public class BinaryPlainValuesReader extends ValuesReader {
   private static final Logger LOG = LoggerFactory.getLogger(BinaryPlainValuesReader.class);
-  private ByteBufferInputStream in;
+  private ByteBuffer buffer;
 
   @Override
   public Binary readBytes() {
     try {
-      int length = BytesUtils.readIntLittleEndian(in);
-      return Binary.fromConstantByteBuffer(in.slice(length));
-    } catch (IOException | RuntimeException e) {
-      throw new ParquetDecodingException("could not read bytes at offset " + in.position(), e);
+      int length = readLength();
+      ByteBuffer valueSlice = buffer.slice();
+      valueSlice.limit(length);
+      buffer.position(buffer.position() + length);
+      return Binary.fromConstantByteBuffer(valueSlice);
+    } catch (RuntimeException e) {
+      throw new ParquetDecodingException("could not read bytes at offset " + buffer.position(), e);
     }
   }
 
   @Override
   public void skip() {
     try {
-      int length = BytesUtils.readIntLittleEndian(in);
-      in.skipFully(length);
-    } catch (IOException | RuntimeException e) {
-      throw new ParquetDecodingException("could not skip bytes at offset " + in.position(), e);
+      int length = readLength();
+      buffer.position(buffer.position() + length);
+    } catch (RuntimeException e) {
+      throw new ParquetDecodingException("could not skip bytes at offset " + buffer.position(), e);
     }
   }
 
+  /**
+   * Reads the 4-byte little-endian length prefix and rejects a length that is negative or
+   * larger than the bytes left in the page, so a corrupt prefix can never move the position backwards.
+   */
+  private int readLength() {
+    int length = buffer.getInt();
+    if (length < 0 || length > buffer.remaining()) {
+      throw new IllegalArgumentException(
+          "invalid value length " + length + " with " + buffer.remaining() + " bytes remaining");
+    }
+    return length;
+  }
+
   @Override
@@ -57,6 +84,11 @@ public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IO
         "init from page at offset {} for length {}",
         stream.position(),
         (stream.available() - stream.position()));
-    this.in = stream.remainingStream();
+    int available = stream.available();
+    if (available > 0) {
+      this.buffer = stream.slice(available).order(ByteOrder.LITTLE_ENDIAN);
+    } else {
+      this.buffer = ByteBuffer.allocate(0).order(ByteOrder.LITTLE_ENDIAN);
+    }
   }
 }