From bea837f989be4f680a5adc04ab6145b8fb962ff0 Mon Sep 17 00:00:00 2001
From: Ryan Blue <blue@apache.org>
Date: Wed, 15 Apr 2026 17:02:25 -0700
Subject: [PATCH 1/5] Adding Claude implementation and benchmark.

---
 .../mumbling/PFOREncodingBenchmark.java       | 200 +++++++
 .../apache/iceberg/mumbling/BitPacking.java   | 499 ++++++++++++++++++
 .../apache/iceberg/mumbling/PFOREncoding.java | 350 ++++++++++++
 .../mumbling/PFOREncodingTestUtils.java       |  66 +++
 .../iceberg/mumbling/TestPFOREncoding.java    | 163 ++++++
 .../mumbling/TestPFOREncodingRandom.java      | 169 ++++++
 gradle/libs.versions.toml                     |   2 +
 jmh.gradle                                    |   6 +
 8 files changed, 1455 insertions(+)
 create mode 100644 core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java
 create mode 100644 core/src/main/java/org/apache/iceberg/mumbling/BitPacking.java
 create mode 100644 core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java
 create mode 100644 core/src/test/java/org/apache/iceberg/mumbling/PFOREncodingTestUtils.java
 create mode 100644 core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncoding.java
 create mode 100644 core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java
diff --git a/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java b/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java
new file mode 100644
index 000000000000..3c6a21d658e4
--- /dev/null
+++ b/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.mumbling;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.TimeUnit;
+import me.lemire.integercompression.FastPFOR128;
+import me.lemire.integercompression.IntWrapper;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Timeout;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * A benchmark that evaluates the performance of {@link PFOREncoding} and compares it with
+ * JavaFastPFOR.
+ *
+ * <p>Two data shapes are exercised:
+ *
+ * <ul>
+ *   <li><b>Descriptor</b>: 256 values in {@code [0, 31]} with 5% full-range outliers, typical for
+ *       Mumbling bitmap descriptor arrays.
+ *   <li><b>Uniform byte</b>: 256 values drawn uniformly from {@code [0, 255]}, the worst case for
+ *       PFOR (no compression benefit; exercises the fallback path).
+ * </ul>
+ *
+ * <p>To run this benchmark: <code>
+ *   ./gradlew :iceberg-core:jmh
+ *       -PjmhIncludeRegex=PFOREncodingBenchmark
+ *       -PjmhOutputPath=benchmark/pfor-encoding-benchmark.txt
+ * </code>
+ */
+@Fork(1)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3)
+@Measurement(iterations = 5)
+@BenchmarkMode(Mode.Throughput)
+@Timeout(time = 5, timeUnit = TimeUnit.MINUTES)
+public class PFOREncodingBenchmark {
+
+  // Fixed seeds for reproducibility
+  private static final long DESCRIPTOR_SEED = 0x5a5a5a5a5a5a5a5aL;
+  private static final long UNIFORM_SEED = 0xa1b2c3d4e5f60708L;
+
+  // Iceberg PFOR input arrays
+  private int[] descriptorValues;
+  private int[] uniformValues;
+
+  // Pre-encoded buffers for decode benchmarks
+  private ByteBuffer descriptorEncoded;
+  private ByteBuffer uniformEncoded;
+
+  // Reusable encode buffer (avoids allocation in the encode hot path)
+  private ByteBuffer encodeBuffer;
+
+  // JavaFastPFOR compressed arrays (pre-encoded for decode benchmarks)
+  private int[] descriptorFastPFOREncoded;
+  private int[] uniformFastPFOREncoded;
+
+  // Reusable output buffer for JavaFastPFOR decoding (avoids allocation in the hot path)
+  private int[] fastPFOROutputBuffer;
+
+  // Shared JavaFastPFOR codec: constructing one allocates page-sized scratch buffers, which
+  // would otherwise be charged to every invocation and dwarf the work on a 256-value input
+  private FastPFOR128 fastPFOR;
+
+  @Setup
+  public void setupBenchmark() {
+    // 256-value descriptor-like data: mostly [0,31] with ~5% [0,255] outliers
+    descriptorValues = PFOREncodingTestUtils.sparse(256, DESCRIPTOR_SEED, 5);
+
+    // 256-value uniform byte data
+    uniformValues = PFOREncodingTestUtils.uniform(256, UNIFORM_SEED, 255);
+
+    // Reusable encode buffer: worst-case for a single 256-value chunk
+    encodeBuffer = ByteBuffer.allocate(3 + 256);
+
+    // Pre-encode for decode benchmarks
+    descriptorEncoded = PFOREncoding.encode(descriptorValues);
+    uniformEncoded = PFOREncoding.encode(uniformValues);
+
+    // Create the JavaFastPFOR codec once and pre-encode with it for decode benchmarks
+    fastPFOR = new FastPFOR128();
+    descriptorFastPFOREncoded = fastPFOREncode(fastPFOR, descriptorValues);
+    uniformFastPFOREncoded = fastPFOREncode(fastPFOR, uniformValues);
+
+    // Output buffer large enough for any 256-value decoded result
+    fastPFOROutputBuffer = new int[256 + 1024];
+  }
+
+  // ---------------------------------------------------------------------------
+  // Iceberg PFOR — descriptor data shape
+  // ---------------------------------------------------------------------------
+
+  @Benchmark
+  @Threads(1)
+  public void encodeDescriptorIceberg(Blackhole blackhole) {
+    blackhole.consume(PFOREncoding.encode(descriptorValues, descriptorValues.length, encodeBuffer));
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void decodeDescriptorIceberg(Blackhole blackhole) {
+    blackhole.consume(PFOREncoding.decode(descriptorEncoded, descriptorValues.length));
+  }
+
+  // ---------------------------------------------------------------------------
+  // Iceberg PFOR — uniform byte data shape
+  // ---------------------------------------------------------------------------
+
+  @Benchmark
+  @Threads(1)
+  public void encodeUniformIceberg(Blackhole blackhole) {
+    blackhole.consume(PFOREncoding.encode(uniformValues, uniformValues.length, encodeBuffer));
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void decodeUniformIceberg(Blackhole blackhole) {
+    blackhole.consume(PFOREncoding.decode(uniformEncoded, uniformValues.length));
+  }
+
+  // ---------------------------------------------------------------------------
+  // JavaFastPFOR — descriptor data shape
+  // ---------------------------------------------------------------------------
+
+  @Benchmark
+  @Threads(1)
+  public void encodeDescriptorFastPFOR(Blackhole blackhole) {
+    blackhole.consume(fastPFOREncode(fastPFOR, descriptorValues));
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void decodeDescriptorFastPFOR(Blackhole blackhole) {
+    blackhole.consume(fastPFORDecode(fastPFOR, descriptorFastPFOREncoded, fastPFOROutputBuffer));
+  }
+
+  // ---------------------------------------------------------------------------
+  // JavaFastPFOR — uniform byte data shape
+  // ---------------------------------------------------------------------------
+
+  @Benchmark
+  @Threads(1)
+  public void encodeUniformFastPFOR(Blackhole blackhole) {
+    blackhole.consume(fastPFOREncode(fastPFOR, uniformValues));
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void decodeUniformFastPFOR(Blackhole blackhole) {
+    blackhole.consume(fastPFORDecode(fastPFOR, uniformFastPFOREncoded, fastPFOROutputBuffer));
+  }
+
+  // ---------------------------------------------------------------------------
+  // JavaFastPFOR helpers
+  // ---------------------------------------------------------------------------
+
+  private static int[] fastPFOREncode(FastPFOR128 codec, int[] values) {
+    int[] output = new int[values.length + 1024];
+    IntWrapper inPos = new IntWrapper(0);
+    IntWrapper outPos = new IntWrapper(0);
+    codec.compress(values, inPos, values.length, output, outPos);
+    int[] result = new int[outPos.get()];
+    System.arraycopy(output, 0, result, 0, outPos.get());
+    return result;
+  }
+
+  // Decodes into the caller-supplied buffer so decode benchmarks do not allocate per invocation
+  private static int[] fastPFORDecode(FastPFOR128 codec, int[] encoded, int[] output) {
+    IntWrapper inPos = new IntWrapper(0);
+    IntWrapper outPos = new IntWrapper(0);
+    codec.uncompress(encoded, inPos, encoded.length, output, outPos);
+    return output;
+  }
+}
diff --git a/core/src/main/java/org/apache/iceberg/mumbling/BitPacking.java b/core/src/main/java/org/apache/iceberg/mumbling/BitPacking.java
new file mode 100644
index 000000000000..9333ac2c24c8
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/mumbling/BitPacking.java
@@ -0,0 +1,499 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.mumbling;
+
+import java.nio.ByteBuffer;
+
+/**
+ * MSB-first bit packing and unpacking for values of 1–7 bits.
+ *
+ * <p>Values are packed in groups of 8: each group of 8 values occupies exactly {@code b} bytes.
+ * A trailing partial group of {@code rem = count & 7} values is left-aligned in {@code
+ * ceil(rem * b / 8)} bytes, padded with zero bits.
+ *
+ * <p>Each specialized method (b=1..7) has a compile-time-constant bit width, allowing the JIT to
+ * fold shift amounts to immediates and inline the byte sequence, eliminating the inner loop and
+ * its loop-carried shift dependency.
+ */
+class BitPacking {
+
+  private BitPacking() {}
+
+  /**
+   * Dispatches to the packer specialized for bit width {@code b} (1–7), writing {@code count}
+   * values read from {@code values[valPos..]} into {@code out} at absolute index {@code outPos}.
+   */
+  static void packBits(
+      int[] values, int valPos, int count, ByteBuffer out, int outPos, int b) {
+    switch (b) {
+      case 1:
+        packBits1(values, valPos, count, out, outPos);
+        return;
+      case 2:
+        packBits2(values, valPos, count, out, outPos);
+        return;
+      case 3:
+        packBits3(values, valPos, count, out, outPos);
+        return;
+      case 4:
+        packBits4(values, valPos, count, out, outPos);
+        return;
+      case 5:
+        packBits5(values, valPos, count, out, outPos);
+        return;
+      case 6:
+        packBits6(values, valPos, count, out, outPos);
+        return;
+      case 7:
+        packBits7(values, valPos, count, out, outPos);
+        return;
+      default:
+        throw new IllegalArgumentException("Invalid bit width: " + b);
+    }
+  }
+
+  /**
+   * Dispatches to the unpacker specialized for bit width {@code b} (1–7): decodes {@code count}
+   * values from {@code data}, read at absolute index {@code dataPos}, into {@code output[0..]}.
+   */
+  static void unpackBits(ByteBuffer data, int dataPos, int[] output, int count, int b) {
+    switch (b) {
+      case 1:
+        unpackBits1(data, dataPos, output, count);
+        return;
+      case 2:
+        unpackBits2(data, dataPos, output, count);
+        return;
+      case 3:
+        unpackBits3(data, dataPos, output, count);
+        return;
+      case 4:
+        unpackBits4(data, dataPos, output, count);
+        return;
+      case 5:
+        unpackBits5(data, dataPos, output, count);
+        return;
+      case 6:
+        unpackBits6(data, dataPos, output, count);
+        return;
+      case 7:
+        unpackBits7(data, dataPos, output, count);
+        return;
+      default:
+        throw new IllegalArgumentException("Invalid bit width: " + b);
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Specialized pack: b=1..7
+  // Each method packs 8 values into exactly b bytes (full groups), plus a
+  // partial group of rem < 8 values left-aligned in ceil(rem*b/8) bytes.
+  // ---------------------------------------------------------------------------
+
+  /**
+   * Packs 1-bit values: each full group of 8 values fills one byte, first value in the most
+   * significant bit. Only the lowest bit of every value is kept.
+   *
+   * <p>The trailing {@code count & 7} values go through {@code packRemainder}, the helper the
+   * wider packers already use: with a width and mask of 1 it writes the same single
+   * left-aligned byte, so the remainder logic lives in one place.
+   */
+  private static void packBits1(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
+    int fullGroups = count >>> 3;
+    for (int g = 0; g < fullGroups; g += 1) {
+      int vBase = valPos + (g << 3);
+      out.put(
+          outPos + g,
+          (byte)
+              (((values[vBase] & 1) << 7)
+                  | ((values[vBase + 1] & 1) << 6)
+                  | ((values[vBase + 2] & 1) << 5)
+                  | ((values[vBase + 3] & 1) << 4)
+                  | ((values[vBase + 4] & 1) << 3)
+                  | ((values[vBase + 5] & 1) << 2)
+                  | ((values[vBase + 6] & 1) << 1)
+                  | (values[vBase + 7] & 1)));
+    }
+    // Same partial-group path as the 2–7 bit packers instead of a private copy of the loop
+    packRemainder(values, valPos, count, out, outPos, fullGroups, 1, 1);
+  }
+
+  /** Packs 2-bit values MSB-first: each full group of 8 values fills exactly 2 bytes. */
+  private static void packBits2(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int src = valPos + group * 8;
+      int dst = outPos + group * 2;
+      int packed =
+          ((values[src] & 0x3) << 14)
+              | ((values[src + 1] & 0x3) << 12)
+              | ((values[src + 2] & 0x3) << 10)
+              | ((values[src + 3] & 0x3) << 8)
+              | ((values[src + 4] & 0x3) << 6)
+              | ((values[src + 5] & 0x3) << 4)
+              | ((values[src + 6] & 0x3) << 2)
+              | (values[src + 7] & 0x3);
+      out.put(dst, (byte) (packed >>> 8));
+      out.put(dst + 1, (byte) packed);
+    }
+    packRemainder(values, valPos, count, out, outPos, groups, 2, 0x3);
+  }
+
+  /** Packs 3-bit values MSB-first: each full group of 8 values fills exactly 3 bytes. */
+  private static void packBits3(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int src = valPos + group * 8;
+      int dst = outPos + group * 3;
+      int packed =
+          ((values[src] & 0x7) << 21)
+              | ((values[src + 1] & 0x7) << 18)
+              | ((values[src + 2] & 0x7) << 15)
+              | ((values[src + 3] & 0x7) << 12)
+              | ((values[src + 4] & 0x7) << 9)
+              | ((values[src + 5] & 0x7) << 6)
+              | ((values[src + 6] & 0x7) << 3)
+              | (values[src + 7] & 0x7);
+      out.put(dst, (byte) (packed >>> 16));
+      out.put(dst + 1, (byte) (packed >>> 8));
+      out.put(dst + 2, (byte) packed);
+    }
+    packRemainder(values, valPos, count, out, outPos, groups, 3, 0x7);
+  }
+
+  /** Packs 4-bit values MSB-first: each full group of 8 values fills exactly 4 bytes. */
+  private static void packBits4(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int src = valPos + group * 8;
+      int dst = outPos + group * 4;
+      long packed =
+          ((long) (values[src] & 0xF) << 28)
+              | ((long) (values[src + 1] & 0xF) << 24)
+              | ((long) (values[src + 2] & 0xF) << 20)
+              | ((long) (values[src + 3] & 0xF) << 16)
+              | ((long) (values[src + 4] & 0xF) << 12)
+              | ((long) (values[src + 5] & 0xF) << 8)
+              | ((long) (values[src + 6] & 0xF) << 4)
+              | (long) (values[src + 7] & 0xF);
+      out.put(dst, (byte) (packed >>> 24));
+      out.put(dst + 1, (byte) (packed >>> 16));
+      out.put(dst + 2, (byte) (packed >>> 8));
+      out.put(dst + 3, (byte) packed);
+    }
+    packRemainder(values, valPos, count, out, outPos, groups, 4, 0xF);
+  }
+
+  /** Packs 5-bit values MSB-first: each full group of 8 values fills exactly 5 bytes. */
+  private static void packBits5(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int src = valPos + group * 8;
+      int dst = outPos + group * 5;
+      long packed =
+          ((long) (values[src] & 0x1F) << 35)
+              | ((long) (values[src + 1] & 0x1F) << 30)
+              | ((long) (values[src + 2] & 0x1F) << 25)
+              | ((long) (values[src + 3] & 0x1F) << 20)
+              | ((long) (values[src + 4] & 0x1F) << 15)
+              | ((long) (values[src + 5] & 0x1F) << 10)
+              | ((long) (values[src + 6] & 0x1F) << 5)
+              | (long) (values[src + 7] & 0x1F);
+      out.put(dst, (byte) (packed >>> 32));
+      out.put(dst + 1, (byte) (packed >>> 24));
+      out.put(dst + 2, (byte) (packed >>> 16));
+      out.put(dst + 3, (byte) (packed >>> 8));
+      out.put(dst + 4, (byte) packed);
+    }
+    packRemainder(values, valPos, count, out, outPos, groups, 5, 0x1F);
+  }
+
+  /** Packs 6-bit values MSB-first: each full group of 8 values fills exactly 6 bytes. */
+  private static void packBits6(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int src = valPos + group * 8;
+      int dst = outPos + group * 6;
+      long packed =
+          ((long) (values[src] & 0x3F) << 42)
+              | ((long) (values[src + 1] & 0x3F) << 36)
+              | ((long) (values[src + 2] & 0x3F) << 30)
+              | ((long) (values[src + 3] & 0x3F) << 24)
+              | ((long) (values[src + 4] & 0x3F) << 18)
+              | ((long) (values[src + 5] & 0x3F) << 12)
+              | ((long) (values[src + 6] & 0x3F) << 6)
+              | (long) (values[src + 7] & 0x3F);
+      out.put(dst, (byte) (packed >>> 40));
+      out.put(dst + 1, (byte) (packed >>> 32));
+      out.put(dst + 2, (byte) (packed >>> 24));
+      out.put(dst + 3, (byte) (packed >>> 16));
+      out.put(dst + 4, (byte) (packed >>> 8));
+      out.put(dst + 5, (byte) packed);
+    }
+    packRemainder(values, valPos, count, out, outPos, groups, 6, 0x3F);
+  }
+
+  /** Packs 7-bit values MSB-first: each full group of 8 values fills exactly 7 bytes. */
+  private static void packBits7(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int src = valPos + group * 8;
+      int dst = outPos + group * 7;
+      long packed =
+          ((long) (values[src] & 0x7F) << 49)
+              | ((long) (values[src + 1] & 0x7F) << 42)
+              | ((long) (values[src + 2] & 0x7F) << 35)
+              | ((long) (values[src + 3] & 0x7F) << 28)
+              | ((long) (values[src + 4] & 0x7F) << 21)
+              | ((long) (values[src + 5] & 0x7F) << 14)
+              | ((long) (values[src + 6] & 0x7F) << 7)
+              | (long) (values[src + 7] & 0x7F);
+      out.put(dst, (byte) (packed >>> 48));
+      out.put(dst + 1, (byte) (packed >>> 40));
+      out.put(dst + 2, (byte) (packed >>> 32));
+      out.put(dst + 3, (byte) (packed >>> 24));
+      out.put(dst + 4, (byte) (packed >>> 16));
+      out.put(dst + 5, (byte) (packed >>> 8));
+      out.put(dst + 6, (byte) packed);
+    }
+    packRemainder(values, valPos, count, out, outPos, groups, 7, 0x7F);
+  }
+
+  /**
+   * Encodes the trailing partial group ({@code count & 7} values): the values are concatenated
+   * MSB-first and left-aligned in {@code ceil(rem * bitsPerValue / 8)} bytes, low bits zero.
+   */
+  private static void packRemainder(
+      int[] values,
+      int valPos,
+      int count,
+      ByteBuffer out,
+      int outPos,
+      int fullGroups,
+      int bitsPerValue,
+      int mask) {
+    int tail = count & 7;
+    if (tail > 0) {
+      int src = valPos + fullGroups * 8;
+      int dst = outPos + fullGroups * bitsPerValue;
+      long bits = 0;
+      for (int i = 0; i < tail; i++) {
+        bits = (bits << bitsPerValue) | (values[src + i] & mask);
+      }
+      int usedBits = tail * bitsPerValue;
+      int byteCount = (usedBits + 7) >>> 3;
+      bits <<= byteCount * 8 - usedBits;
+      for (int i = byteCount - 1; i >= 0; i--) {
+        out.put(dst + i, (byte) bits);
+        bits >>>= 8;
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Specialized unpack: b=1..7
+  // ---------------------------------------------------------------------------
+
+  /** Unpacks 1-bit values: each full byte expands to 8 values, most significant bit first. */
+  private static void unpackBits1(ByteBuffer data, int dataPos, int[] output, int count) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int dst = group * 8;
+      int bits = Byte.toUnsignedInt(data.get(dataPos + group));
+      output[dst] = (bits >>> 7) & 1;
+      output[dst + 1] = (bits >>> 6) & 1;
+      output[dst + 2] = (bits >>> 5) & 1;
+      output[dst + 3] = (bits >>> 4) & 1;
+      output[dst + 4] = (bits >>> 3) & 1;
+      output[dst + 5] = (bits >>> 2) & 1;
+      output[dst + 6] = (bits >>> 1) & 1;
+      output[dst + 7] = bits & 1;
+    }
+    unpackRemainder(data, dataPos, output, count, groups, 1, 1);
+  }
+
+  /** Unpacks 2-bit values: each full group of 2 bytes expands to 8 values, MSB-first. */
+  private static void unpackBits2(ByteBuffer data, int dataPos, int[] output, int count) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int dst = group * 8;
+      int src = dataPos + group * 2;
+      int bits = (Byte.toUnsignedInt(data.get(src)) << 8) | Byte.toUnsignedInt(data.get(src + 1));
+      output[dst] = (bits >>> 14) & 0x3;
+      output[dst + 1] = (bits >>> 12) & 0x3;
+      output[dst + 2] = (bits >>> 10) & 0x3;
+      output[dst + 3] = (bits >>> 8) & 0x3;
+      output[dst + 4] = (bits >>> 6) & 0x3;
+      output[dst + 5] = (bits >>> 4) & 0x3;
+      output[dst + 6] = (bits >>> 2) & 0x3;
+      output[dst + 7] = bits & 0x3;
+    }
+    unpackRemainder(data, dataPos, output, count, groups, 2, 0x3);
+  }
+
+  /** Unpacks 3-bit values: each full group of 3 bytes expands to 8 values, MSB-first. */
+  private static void unpackBits3(ByteBuffer data, int dataPos, int[] output, int count) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int dst = group * 8;
+      int src = dataPos + group * 3;
+      int bits =
+          (Byte.toUnsignedInt(data.get(src)) << 16)
+              | (Byte.toUnsignedInt(data.get(src + 1)) << 8)
+              | Byte.toUnsignedInt(data.get(src + 2));
+      output[dst] = (bits >>> 21) & 0x7;
+      output[dst + 1] = (bits >>> 18) & 0x7;
+      output[dst + 2] = (bits >>> 15) & 0x7;
+      output[dst + 3] = (bits >>> 12) & 0x7;
+      output[dst + 4] = (bits >>> 9) & 0x7;
+      output[dst + 5] = (bits >>> 6) & 0x7;
+      output[dst + 6] = (bits >>> 3) & 0x7;
+      output[dst + 7] = bits & 0x7;
+    }
+    unpackRemainder(data, dataPos, output, count, groups, 3, 0x7);
+  }
+
+  /** Unpacks 4-bit values: each full group of 4 bytes expands to 8 values, MSB-first. */
+  private static void unpackBits4(ByteBuffer data, int dataPos, int[] output, int count) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int dst = group * 8;
+      int src = dataPos + group * 4;
+      long bits =
+          (Byte.toUnsignedLong(data.get(src)) << 24)
+              | (Byte.toUnsignedLong(data.get(src + 1)) << 16)
+              | (Byte.toUnsignedLong(data.get(src + 2)) << 8)
+              | Byte.toUnsignedLong(data.get(src + 3));
+      output[dst] = (int) (bits >>> 28) & 0xF;
+      output[dst + 1] = (int) (bits >>> 24) & 0xF;
+      output[dst + 2] = (int) (bits >>> 20) & 0xF;
+      output[dst + 3] = (int) (bits >>> 16) & 0xF;
+      output[dst + 4] = (int) (bits >>> 12) & 0xF;
+      output[dst + 5] = (int) (bits >>> 8) & 0xF;
+      output[dst + 6] = (int) (bits >>> 4) & 0xF;
+      output[dst + 7] = (int) bits & 0xF;
+    }
+    unpackRemainder(data, dataPos, output, count, groups, 4, 0xF);
+  }
+
+  /** Unpacks 5-bit values: each full group of 5 bytes expands to 8 values, MSB-first. */
+  private static void unpackBits5(ByteBuffer data, int dataPos, int[] output, int count) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int dst = group * 8;
+      int src = dataPos + group * 5;
+      long bits =
+          (Byte.toUnsignedLong(data.get(src)) << 32)
+              | (Byte.toUnsignedLong(data.get(src + 1)) << 24)
+              | (Byte.toUnsignedLong(data.get(src + 2)) << 16)
+              | (Byte.toUnsignedLong(data.get(src + 3)) << 8)
+              | Byte.toUnsignedLong(data.get(src + 4));
+      output[dst] = (int) (bits >>> 35) & 0x1F;
+      output[dst + 1] = (int) (bits >>> 30) & 0x1F;
+      output[dst + 2] = (int) (bits >>> 25) & 0x1F;
+      output[dst + 3] = (int) (bits >>> 20) & 0x1F;
+      output[dst + 4] = (int) (bits >>> 15) & 0x1F;
+      output[dst + 5] = (int) (bits >>> 10) & 0x1F;
+      output[dst + 6] = (int) (bits >>> 5) & 0x1F;
+      output[dst + 7] = (int) bits & 0x1F;
+    }
+    unpackRemainder(data, dataPos, output, count, groups, 5, 0x1F);
+  }
+
+  /** Unpacks 6-bit values: each full group of 6 bytes expands to 8 values, MSB-first. */
+  private static void unpackBits6(ByteBuffer data, int dataPos, int[] output, int count) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int dst = group * 8;
+      int src = dataPos + group * 6;
+      long bits =
+          (Byte.toUnsignedLong(data.get(src)) << 40)
+              | (Byte.toUnsignedLong(data.get(src + 1)) << 32)
+              | (Byte.toUnsignedLong(data.get(src + 2)) << 24)
+              | (Byte.toUnsignedLong(data.get(src + 3)) << 16)
+              | (Byte.toUnsignedLong(data.get(src + 4)) << 8)
+              | Byte.toUnsignedLong(data.get(src + 5));
+      output[dst] = (int) (bits >>> 42) & 0x3F;
+      output[dst + 1] = (int) (bits >>> 36) & 0x3F;
+      output[dst + 2] = (int) (bits >>> 30) & 0x3F;
+      output[dst + 3] = (int) (bits >>> 24) & 0x3F;
+      output[dst + 4] = (int) (bits >>> 18) & 0x3F;
+      output[dst + 5] = (int) (bits >>> 12) & 0x3F;
+      output[dst + 6] = (int) (bits >>> 6) & 0x3F;
+      output[dst + 7] = (int) bits & 0x3F;
+    }
+    unpackRemainder(data, dataPos, output, count, groups, 6, 0x3F);
+  }
+
+  /** Unpacks 7-bit values: each full group of 7 bytes expands to 8 values, MSB-first. */
+  private static void unpackBits7(ByteBuffer data, int dataPos, int[] output, int count) {
+    int groups = count >>> 3;
+    for (int group = 0; group < groups; group++) {
+      int dst = group * 8;
+      int src = dataPos + group * 7;
+      long bits =
+          (Byte.toUnsignedLong(data.get(src)) << 48)
+              | (Byte.toUnsignedLong(data.get(src + 1)) << 40)
+              | (Byte.toUnsignedLong(data.get(src + 2)) << 32)
+              | (Byte.toUnsignedLong(data.get(src + 3)) << 24)
+              | (Byte.toUnsignedLong(data.get(src + 4)) << 16)
+              | (Byte.toUnsignedLong(data.get(src + 5)) << 8)
+              | Byte.toUnsignedLong(data.get(src + 6));
+      output[dst] = (int) (bits >>> 49) & 0x7F;
+      output[dst + 1] = (int) (bits >>> 42) & 0x7F;
+      output[dst + 2] = (int) (bits >>> 35) & 0x7F;
+      output[dst + 3] = (int) (bits >>> 28) & 0x7F;
+      output[dst + 4] = (int) (bits >>> 21) & 0x7F;
+      output[dst + 5] = (int) (bits >>> 14) & 0x7F;
+      output[dst + 6] = (int) (bits >>> 7) & 0x7F;
+      output[dst + 7] = (int) bits & 0x7F;
+    }
+    unpackRemainder(data, dataPos, output, count, groups, 7, 0x7F);
+  }
+
+  /**
+   * Decodes the trailing partial group ({@code count & 7} values) for bit widths 1–7: gathers
+   * {@code ceil(rem * bitsPerValue / 8)} bytes, shifts out the padding, extracts last-to-first.
+   */
+  private static void unpackRemainder(
+      ByteBuffer data,
+      int dataPos,
+      int[] output,
+      int count,
+      int fullGroups,
+      int bitsPerValue,
+      int mask) {
+    int tail = count & 7;
+    if (tail > 0) {
+      int dst = fullGroups * 8;
+      int src = dataPos + fullGroups * bitsPerValue;
+      int usedBits = tail * bitsPerValue;
+      int byteCount = (usedBits + 7) >>> 3;
+      long bits = 0;
+      for (int i = 0; i < byteCount; i++) {
+        bits = (bits << 8) | Byte.toUnsignedLong(data.get(src + i));
+      }
+      bits >>>= byteCount * 8 - usedBits;
+      for (int i = tail - 1; i >= 0; i--) {
+        output[dst + i] = (int) (bits & mask);
+        bits >>>= bitsPerValue;
+      }
+    }
+  }
+}
diff --git a/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java b/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java
new file mode 100644
index 000000000000..5981dbe2713a
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java
@@ -0,0 +1,350 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.mumbling;
+
+import java.nio.ByteBuffer;
+import org.apache.iceberg.util.Pair;
+
+/**
+ * PFOR (Patched Frame of Reference) encoding for arrays of unsigned byte values.
+ *
+ * <p>Implements the encoding described in Appendix A of the Mumbling bitmap specification. The
+ * input array is split into 256-value chunks (the last chunk may be shorter). Each chunk is
+ * independently compressed.
+ *
+ * <p>Each chunk is stored as:
+ *
+ * <ul>
+ *   <li>3-byte header: {@code b1|b2} (low/high nibbles of byte 0), {@code e} (byte 1), {@code m}
+ *       (byte 2)
+ *   <li>Primary array: {@code ceil(n * b1 / 8)} bytes — the low {@code b1} bits of every value,
+ *       packed MSB-first
+ *   <li>Exception offsets: {@code e} bytes — the chunk-relative position of each exception value
+ *   <li>Exception values: {@code ceil(e * b2 / 8)} bytes — bits {@code [b1, b1+b2)} of each
+ *       exception, packed MSB-first
+ * </ul>
+ *
+ * <p>During encoding, the chunk minimum {@code m} is subtracted from every value. During decoding,
+ * {@code m} is added back. When {@code b1 = 8}, no exceptions are produced and {@code m} is stored
+ * as 0 (original values are written directly). Bit packing and unpacking is delegated to {@link
+ * BitPacking}.
+ */
+class PFOREncoding {
+  private static final int CHUNK_SIZE = 256;
+
+  private PFOREncoding() {}
+
+  /**
+   * Encodes an array of unsigned byte values (each in {@code [0, 255]}) using PFOR encoding.
+   *
+   * @param values unsigned byte values to encode; all {@code values.length} entries are encoded
+   * @return a slice of a newly allocated buffer with position=0 and limit=encoded length
+   */
+  static ByteBuffer encode(int[] values) {
+    return encode(values, values.length, null);
+  }
+
+  /**
+   * Encodes the first {@code length} unsigned byte values (each in {@code [0, 255]}) from {@code
+   * values} using PFOR encoding.
+   *
+   * <p>This overload passes {@code null} as the reusable buffer, so output is always written to a
+   * newly allocated buffer. The returned buffer is a slice with position=0 and limit=encoded
+   * length.
+   *
+   * @param values unsigned byte values to encode
+   * @param length number of values to encode
+   * @return a slice of a newly allocated buffer
+   */
+  static ByteBuffer encode(int[] values, int length) {
+    return encode(values, length, null);
+  }
+
+  /**
+   * Encodes the first {@code length} unsigned byte values (each in {@code [0, 255]}) from {@code
+   * values} using PFOR encoding.
+   *
+   * <p>If {@code buffer} has sufficient capacity its backing storage is reused from index 0, even
+   * when its limit is smaller; otherwise a new buffer is allocated. {@code buffer}'s position and
+   * limit are never modified. The returned slice has position=0 and limit=encoded length.
+   *
+   * @param values unsigned byte values to encode
+   * @param length number of values to encode
+   * @param buffer candidate buffer whose storage may be reused, or null to always allocate
+   * @return a slice sharing {@code buffer}'s storage if capacity was sufficient, otherwise a
+   *     slice of a newly allocated buffer
+   */
+  static ByteBuffer encode(int[] values, int length, ByteBuffer buffer) {
+    // Worst-case per chunk is b1=8: 3-byte header + 1 byte per value. Any other b1 chosen by the
+    // encoder costs <= n bytes of data (otherwise b1=8 would have been selected instead).
+    int maxSize = 3 * ceilDiv(length, CHUNK_SIZE) + length;
+    // Absolute puts and slice(int, int) are bounded by limit, so use a cleared duplicate.
+    boolean reuse = buffer != null && buffer.capacity() >= maxSize;
+    ByteBuffer out = reuse ? buffer.duplicate().clear() : ByteBuffer.allocate(maxSize);
+    int pos = 0;
+    int offset = 0;
+    while (offset < length) {
+      int chunkLength = Math.min(CHUNK_SIZE, length - offset);
+      pos += encodeChunk(values, offset, chunkLength, out, pos);
+      offset += chunkLength;
+    }
+
+    return out.slice(0, pos);
+  }
+
+  /**
+   * Decodes PFOR-encoded bytes back to unsigned byte values. Reads from {@code encoded.position()}
+   * using absolute indexing; the buffer's position is not modified.
+   *
+   * @param encoded PFOR-encoded ByteBuffer produced by {@link #encode}
+   * @param count total number of values to decode; chunk boundaries are derived from this value
+   * @return decoded unsigned byte values
+   */
+  static int[] decode(ByteBuffer encoded, int count) {
+    if (count == 0) {
+      return new int[0];
+    }
+
+    int[] output = new int[count];
+    int pos = encoded.position();
+    int start = 0;
+    while (start < count) {
+      int length = Math.min(CHUNK_SIZE, count - start); // only the last chunk may be shorter
+      pos += decodeChunk(encoded, pos, output, start, length); // advance by the bytes consumed
+      start += length;
+    }
+
+    return output;
+  }
+
+  /**
+   * Encodes one chunk into {@code out} starting at absolute position {@code outPos}. Returns the
+   * number of bytes written.
+   */
+  private static int encodeChunk(int[] values, int start, int length, ByteBuffer out, int outPos) {
+    // Step 1: find base=min(values) for normalization
+    int base = min(values, start, length);
+
+    // Step 2: normalize by subtracting base
+    int[] normalized = new int[length];
+    int setBits = 0; // OR of all normalized values; its width is the widest value's width
+    for (int i = 0; i < length; i += 1) {
+      normalized[i] = values[start + i] - base;
+      setBits |= normalized[i];
+    }
+
+    // Step 3: find the maximum bit width needed for normalized values
+    int maxWidth = width(setBits);
+
+    // Step 4: choose b1 to minimize total encoded data size (excluding 3-byte header)
+    Pair<Integer, Integer> widthAndExcCount = chooseBitWidth(normalized, length, maxWidth);
+    int b1 = widthAndExcCount.first();
+    int b2 = maxWidth - b1;
+    int excCount = widthAndExcCount.second();
+    int primaryBytes = ceilDiv(length * b1, 8);
+    int excValueBytes = ceilDiv(excCount * b2, 8);
+
+    // Special case: b1=8 means store original values as raw bytes with a constant header.
+    // b2, e, and m should be 0, so the header is always 0x08 0x00 0x00.
+    if (b1 == 8) {
+      out.put(outPos, (byte) 0x08);
+      out.put(outPos + 1, (byte) 0);
+      out.put(outPos + 2, (byte) 0);
+      return copyBytes(values, start, length, out, outPos + 3) - outPos;
+    }
+
+    // Header: b1 in low nibble, b2 in high nibble, then e, then m
+    // e < length (<= CHUNK_SIZE): the chunk minimum normalizes to 0 and is never an exception
+    out.put(outPos, (byte) ((b2 << 4) | b1));
+    out.put(outPos + 1, (byte) excCount);
+    out.put(outPos + 2, (byte) base);
+    int pos = outPos + 3;
+
+    // Primary array: low b1 bits of every value, packed MSB-first
+    if (b1 > 0) {
+      BitPacking.packBits(normalized, 0, length, out, pos, b1);
+      pos += primaryBytes;
+    }
+
+    // b2 is the bit width of exception values: bits [b1, b1+b2) of each exception
+    if (maxWidth > b1) {
+      int[] exceptionOffsets = new int[length];
+      int[] exceptionValues = new int[length];
+
+      // Step 5: collect exceptions (values that do not fit in b1 bits)
+      int excIndex = 0;
+      int threshold = 1 << b1;
+      for (int i = 0; i < length; i += 1) {
+        if (normalized[i] >= threshold) {
+          exceptionOffsets[excIndex] = i;
+          exceptionValues[excIndex] = normalized[i] >> b1;
+          excIndex += 1;
+        }
+      }
+
+      // Exception offsets (one byte per exception)
+      pos = copyBytes(exceptionOffsets, 0, excCount, out, pos);
+
+      // Exception values: bits [b1, b1+b2) of each exception, packed MSB-first
+      if (b2 > 0 && excCount > 0) {
+        if (b2 == 8) {
+          copyBytes(exceptionValues, 0, excCount, out, pos);
+        } else {
+          BitPacking.packBits(exceptionValues, 0, excCount, out, pos, b2);
+        }
+        pos += excValueBytes;
+      }
+    }
+
+    return pos - outPos;
+  }
+
+  /**
+   * Chooses the primary bit width {@code b1} that minimizes total encoded chunk size.
+   *
+   * <p>For each candidate {@code b} from 0 to {@code maxWidth}, computes:
+   *
+   * <ul>
+   *   <li>{@code e}: number of values needing more than {@code b} bits
+   *   <li>{@code b2 = maxWidth - b}: bits needed for exception remainders
+   *   <li>total size = {@code ceil(n * b / 8) + e + ceil(e * b2 / 8)}
+   * </ul>
+   *
+   * @return pair of (chosen {@code b}, its exception count {@code e}); ties prefer the smaller b
+   */
+  private static Pair<Integer, Integer> chooseBitWidth(int[] normalized, int length, int maxWidth) {
+    int bestWidth = 0;
+    int bestSize = Integer.MAX_VALUE;
+    int bestExcCount = 0;
+
+    for (int b = 0; b <= maxWidth; b += 1) {
+      int e = 0;
+      if (b < 8) {
+        int threshold = 1 << b; // smallest value that does not fit in b bits
+        for (int i = 0; i < length; i += 1) {
+          if (normalized[i] >= threshold) {
+            e += 1;
+          }
+        }
+      }
+
+      int b2 = maxWidth - b;
+      int size = ceilDiv(length * b, 8) + e + ceilDiv(e * b2, 8);
+
+      if (size < bestSize) { // strict comparison keeps the smaller b on ties
+        bestSize = size;
+        bestWidth = b;
+        bestExcCount = e;
+      }
+    }
+
+    return Pair.of(bestWidth, bestExcCount);
+  }
+
+  /**
+   * Decodes one chunk of PFOR-encoded data, writing decoded values into {@code output[start,
+   * start+length)}.
+   *
+   * @return the number of bytes read from {@code data}
+   */
+  private static int decodeChunk(ByteBuffer data, int pos, int[] output, int start, int length) {
+    int b1 = data.get(pos) & 0x0F;
+    int b2 = (data.get(pos) >> 4) & 0x0F;
+    int excCount = data.get(pos + 1) & 0xFF;
+    int base = data.get(pos + 2) & 0xFF;
+    int cursor = pos + 3;
+
+    // Special case: b1=8 means raw bytes; the encoder stores e=0 and m=0, so base adds nothing
+    if (b1 == 8) {
+      for (int i = 0; i < length; i += 1) {
+        output[start + i] = (data.get(cursor + i) & 0xFF) + base;
+      }
+      return cursor + length - pos;
+    }
+
+    // Read primary array: low b1 bits of each value
+    int[] values = new int[length];
+    if (b1 > 0) {
+      BitPacking.unpackBits(data, cursor, values, length, b1);
+      cursor += ceilDiv(length * b1, 8);
+    }
+
+    // Read exception offsets
+    int[] offsets = new int[excCount];
+    for (int i = 0; i < excCount; i += 1) {
+      offsets[i] = data.get(cursor) & 0xFF;
+      cursor += 1;
+    }
+
+    // Read exception values and patch the primary values
+    if (b2 > 0 && excCount > 0) {
+      int[] excValues = new int[excCount];
+      if (b2 == 8) {
+        for (int i = 0; i < excCount; i += 1) {
+          excValues[i] = data.get(cursor + i) & 0xFF;
+        }
+      } else {
+        BitPacking.unpackBits(data, cursor, excValues, excCount, b2);
+      }
+      cursor += ceilDiv(excCount * b2, 8);
+
+      for (int i = 0; i < excCount; i += 1) {
+        values[offsets[i]] |= excValues[i] << b1; // offsets are not range-checked here
+      }
+    }
+
+    // Add back the chunk minimum
+    for (int i = 0; i < length; i += 1) {
+      output[start + i] = values[i] + base;
+    }
+
+    return cursor - pos;
+  }
+
+  // ---------------------------------------------------------------------------
+  // Utilities
+  // ---------------------------------------------------------------------------
+
+  private static int copyBytes(int[] src, int srcStart, int count, ByteBuffer out, int outPos) {
+    for (int i = 0; i < count; i += 1) {
+      out.put(outPos + i, (byte) src[srcStart + i]); // absolute put; keeps only the low 8 bits
+    }
+    return outPos + count; // absolute position just past the last byte written
+  }
+
+  private static int min(int[] values, int start, int length) {
+    int min = 255; // largest unsigned byte value; also the result when length is 0
+    for (int i = start; i < start + length; i += 1) {
+      if (values[i] < min) {
+        min = values[i];
+      }
+    }
+
+    return min;
+  }
+
+  /** Returns the number of bits required to represent {@code value} (0 for value=0). */
+  static int width(int value) {
+    return 32 - Integer.numberOfLeadingZeros(value);
+  }
+
+  static int ceilDiv(int a, int b) {
+    return (a + b - 1) / b; // ceil(a / b) for a >= 0, b > 0 (a + b - 1 must not overflow)
+  }
+}
diff --git a/core/src/test/java/org/apache/iceberg/mumbling/PFOREncodingTestUtils.java b/core/src/test/java/org/apache/iceberg/mumbling/PFOREncodingTestUtils.java
new file mode 100644
index 000000000000..db568ab107f3
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/mumbling/PFOREncodingTestUtils.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.mumbling;
+
+import java.util.Random;
+
+/** Shared data-generation utilities for {@link PFOREncoding} tests and benchmarks. */
+class PFOREncodingTestUtils {
+
+  private PFOREncodingTestUtils() {}
+
+  /**
+   * Generates {@code count} values drawn uniformly from {@code [0, maxValue]} using the given seed.
+   */
+  static int[] uniform(int count, long seed, int maxValue) {
+    Random random = new Random(seed);
+    int[] values = new int[count];
+    for (int i = 0; i < count; i++) {
+      values[i] = random.nextInt(maxValue + 1); // +1 makes the upper bound inclusive
+    }
+    return values;
+  }
+
+  /**
+   * Generates {@code count} values using the given seed. Each position first draws a number in
+   * {@code [0, 100)}; if it is below {@code exceptionPct} the value is drawn from the full range
+   * {@code [0, 255]}, otherwise from {@code [0, 3]}.
+   */
+  static int[] sparse(int count, long seed, int exceptionPct) {
+    Random random = new Random(seed);
+    int[] values = new int[count];
+    for (int i = 0; i < count; i++) {
+      values[i] = random.nextInt(100) < exceptionPct ? random.nextInt(256) : random.nextInt(4);
+    }
+    return values;
+  }
+
+  /**
+   * Generates {@code count} values drawn uniformly from {@code [minValue, minValue + range]} using
+   * the given seed.
+   */
+  static int[] withOffset(int count, long seed, int minValue, int range) {
+    Random random = new Random(seed);
+    int[] values = new int[count];
+    for (int i = 0; i < count; i++) {
+      values[i] = minValue + random.nextInt(range + 1); // +1 makes the upper bound inclusive
+    }
+    return values;
+  }
+}
diff --git a/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncoding.java b/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncoding.java
new file mode 100644
index 000000000000..e2c28aca160f
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncoding.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.mumbling;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import org.apache.iceberg.util.ByteBuffers;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for PFOR encoding based on the examples in Appendix A of the Mumbling bitmap spec.
+ *
+ * <p>Note: examples 3 and 5 from the spec appear to contain typos in the encoded byte sequences.
+ * The expected byte arrays used here are derived from the spec algorithm applied to the stated
+ * decoded values, and the tests verify both the byte-level encoding and the roundtrip decode.
+ *
+ * <p>Example 3 spec header is {@code 80 00 00} but {@code e} must be 2 (two exceptions), so the
+ * correct header is {@code 80 02 00}.
+ *
+ * <p>Example 5 spec bytes are {@code 42 01 06 19 01 A0}, but with {@code b1=2} the cost-minimizing
+ * {@code b2 = maxWidth - b1 = 5 - 2 = 3} (not 4), producing header {@code 32 01 06}.
+ */
+public class TestPFOREncoding {
+
+  // ---------------------------------------------------------------------------
+  // Spec example 1: 256 values, all = 0
+  // Encoding: 0 bits per value, m = 0, no exceptions
+  // Expected: 00 00 00
+  // ---------------------------------------------------------------------------
+  @Test
+  public void testExample1AllZeros() {
+    int[] values = new int[256];
+    Arrays.fill(values, 0); // explicit for readability; a new int[] is already zero-filled
+
+    ByteBuffer encoded = PFOREncoding.encode(values);
+    assertThat(ByteBuffers.toByteArray(encoded)).isEqualTo(bytes(0x00, 0x00, 0x00));
+
+    int[] decoded = PFOREncoding.decode(encoded, 256);
+    assertThat(decoded).isEqualTo(values);
+  }
+
+  // ---------------------------------------------------------------------------
+  // Spec example 2: 51 values, all = 5
+  // Encoding: 0 bits per value, m = 5, no exceptions
+  // Expected: 00 00 05
+  // ---------------------------------------------------------------------------
+  @Test
+  public void testExample2AllFives() {
+    int[] values = new int[51];
+    Arrays.fill(values, 5);
+
+    ByteBuffer encoded = PFOREncoding.encode(values);
+    assertThat(ByteBuffers.toByteArray(encoded)).isEqualTo(bytes(0x00, 0x00, 0x05));
+
+    int[] decoded = PFOREncoding.decode(encoded, 51);
+    assertThat(decoded).isEqualTo(values);
+  }
+
+  // ---------------------------------------------------------------------------
+  // Spec example 3: [0, 0, 0, 0, 0xFF, 0, 0, 0xFE]
+  // Encoding: b1=0, b2=8, m=0, 2 exceptions at positions 4 and 7
+  //
+  // Note: the spec shows header bytes "80 00 00" but e=0 is incorrect; there are
+  // 2 exceptions so the correct encoding is "80 02 00 04 07 FF FE".
+  // ---------------------------------------------------------------------------
+  @Test
+  public void testExample3SparseExceptions() {
+    int[] values = {0, 0, 0, 0, 0xFF, 0, 0, 0xFE};
+
+    // b1=0, b2=8, e=2, m=0  →  header byte0=(8<<4)|0=0x80, byte1=0x02, byte2=0x00
+    // primary: empty (b1=0); data cost 0 + 2 offsets + 2 values = 4 bytes, vs 8 for raw b1=8
+    // offsets: 0x04, 0x07
+    // exception values: 0xFF, 0xFE (8 bits each)
+    byte[] expected =
+        bytes(0x80, 0x02, 0x00, /* offsets */ 0x04, 0x07, /* exceptions */ 0xFF, 0xFE);
+
+    ByteBuffer encoded = PFOREncoding.encode(values);
+    assertThat(ByteBuffers.toByteArray(encoded)).isEqualTo(expected);
+
+    int[] decoded = PFOREncoding.decode(encoded, values.length);
+    assertThat(decoded).isEqualTo(values);
+  }
+
+  // ---------------------------------------------------------------------------
+  // Spec example 4: [6, 7, 8]
+  // Encoding: b1=2, b2=0, m=6, no exceptions
+  // Expected: 02 00 06 18
+  // ---------------------------------------------------------------------------
+  @Test
+  public void testExample4TwoBitsNoExceptions() {
+    int[] values = {6, 7, 8};
+
+    // b1=2, b2=0, e=0, m=6  →  header byte0=(0<<4)|2=0x02, byte1=0x00, byte2=0x06
+    // normalized: [0, 1, 2]
+    // primary: 00 01 10 (padded) = 0b00011000 = 0x18
+    byte[] expected = bytes(0x02, 0x00, 0x06, 0x18);
+
+    ByteBuffer encoded = PFOREncoding.encode(values);
+    assertThat(ByteBuffers.toByteArray(encoded)).isEqualTo(expected);
+
+    int[] decoded = PFOREncoding.decode(encoded, values.length);
+    assertThat(decoded).isEqualTo(values);
+  }
+
+  // ---------------------------------------------------------------------------
+  // Spec example 5: [6, 34, 8, 7]
+  // Encoding: b1=2, b2=3, m=6, 1 exception (value 34 → normalized 28)
+  //
+  // Note: the spec shows "42 01 06 19 01 A0" but that byte sequence is inconsistent
+  // with the stated decoded values. With b1=2 and maxWidth=5, the cost-minimizing
+  // b2 = maxWidth - b1 = 3 (not 4). The correct encoding is:
+  //   header:    32 01 06  (b1=2, b2=3, e=1, m=6)
+  //   primary:   09        ([0,0,2,1] low 2 bits packed MSB-first)
+  //   offset:    01        (exception at position 1)
+  //   exception: E0        (7 = 28>>2, packed as 3 bits: 111_00000)
+  // ---------------------------------------------------------------------------
+  @Test
+  public void testExample5TwoBitsOneException() {
+    int[] values = {6, 34, 8, 7};
+
+    // b1=2, b2=3, e=1, m=6  →  header byte0=(3<<4)|2=0x32, byte1=0x01, byte2=0x06
+    // normalized: [0, 28, 2, 1]
+    // primary:    [00, 00, 10, 01] packed MSB-first = 0b00001001 = 0x09
+    // exception offset: 0x01  (position 1)
+    // exception value:  28>>2 = 7 = 0b111, 3 bits MSB-first = 0b111_00000 = 0xE0
+    byte[] expected = bytes(0x32, 0x01, 0x06, 0x09, 0x01, 0xE0);
+
+    ByteBuffer encoded = PFOREncoding.encode(values);
+    assertThat(ByteBuffers.toByteArray(encoded)).isEqualTo(expected);
+
+    int[] decoded = PFOREncoding.decode(encoded, values.length);
+    assertThat(decoded).isEqualTo(values);
+  }
+
+  // ---------------------------------------------------------------------------
+  // Helper to convert varargs ints to a byte array (each int narrowed to its low 8 bits)
+  // ---------------------------------------------------------------------------
+  private static byte[] bytes(int... values) {
+    byte[] result = new byte[values.length];
+    for (int i = 0; i < values.length; i++) {
+      result[i] = (byte) values[i];
+    }
+    return result;
+  }
+}
diff --git a/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java b/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java
new file mode 100644
index 000000000000..fb6cdf1bd021
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.mumbling;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.ByteBuffer;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Randomized roundtrip tests for {@link PFOREncoding}.
+ *
+ * <p>Each test case embeds a fixed seed so that failures are reproducible. The cases cover:
+ *
+ * <ul>
+ *   <li><b>Uniform distributions</b> at several value ranges to exercise different primary bit
+ *       widths ({@code b1}): 0-1 → b1=1, 0-3 → b1=2, 0-31 → b1=5, 0-255 → b1=8.
+ *   <li><b>Size boundaries</b> around 8 (the pack/unpack group size) and 256 (the chunk size), to
+ *       exercise the remainder logic and multi-chunk splitting.
+ *   <li><b>Sparse distributions</b> — mostly-uniform base values with a small fraction of outliers
+ *       — to exercise the exception encoding path, where a large {@code b2} is needed.
+ *   <li><b>Non-zero minimum</b> values, to exercise the {@code m} subtraction.
+ * </ul>
+ */
+public class TestPFOREncodingRandom {
+
+  // ---------------------------------------------------------------------------
+  // Uniform distributions at different value ranges
+  // ---------------------------------------------------------------------------
+
+  /**
+   * Each row: (description, count, seed, maxValue). Values are drawn uniformly from [0, maxValue].
+   * maxValue caps the value width and therefore the largest primary bit width {@code b1}.
+   */
+  static Stream<Arguments> uniformCases() {
+    return Stream.of(
+        // 1-bit range (b1<=1): values in {0, 1}
+        Arguments.of("1-bit range, size=1",    1,   0x1a2b3c4dL, 1),
+        Arguments.of("1-bit range, size=7",    7,   0x5e6f7a8bL, 1),
+        Arguments.of("1-bit range, size=8",    8,   0xdeadbeefL, 1),
+        Arguments.of("1-bit range, size=9",    9,   0xcafebabeL, 1),
+        Arguments.of("1-bit range, size=256",  256, 0xf00dfa11L, 1),
+
+        // 2-bit range (b1<=2): values in [0, 3]
+        Arguments.of("2-bit range, size=7",    7,   0x1337c0deL, 3),
+        Arguments.of("2-bit range, size=8",    8,   0xa5a5a5a5L, 3),
+        Arguments.of("2-bit range, size=9",    9,   0x0badf00dL, 3),
+        Arguments.of("2-bit range, size=100",  100, 0x12345678L, 3),
+        Arguments.of("2-bit range, size=256",  256, 0x87654321L, 3),
+
+        // 3-bit range (b1<=3): values in [0, 7]
+        Arguments.of("3-bit range, size=7",    7,   0xabcdef01L, 7),
+        Arguments.of("3-bit range, size=24",   24,  0x01fedcbaL, 7),
+        Arguments.of("3-bit range, size=256",  256, 0xbeefcafeL, 7),
+
+        // 5-bit range (b1<=5): values in [0, 31] — typical for Mumbling descriptor bytes
+        Arguments.of("5-bit range, size=7",    7,   0x11223344L, 31),
+        Arguments.of("5-bit range, size=8",    8,   0x44332211L, 31),
+        Arguments.of("5-bit range, size=63",   63,  0xaabbccddL, 31),
+        Arguments.of("5-bit range, size=64",   64,  0xddccbbaaL, 31),
+        Arguments.of("5-bit range, size=256",  256, 0x55667788L, 31),
+
+        // Full byte range (b1=8 or b1 chosen by cost): values in [0, 255]
+        Arguments.of("full range, size=1",     1,   0x99aabbccL, 255),
+        Arguments.of("full range, size=7",     7,   0xcc998877L, 255),
+        Arguments.of("full range, size=8",     8,   0x13572468L, 255),
+        Arguments.of("full range, size=9",     9,   0x24681357L, 255),
+        Arguments.of("full range, size=15",    15,  0xfedcba98L, 255),
+        Arguments.of("full range, size=16",    16,  0x89abcdefL, 255),
+        Arguments.of("full range, size=100",   100, 0x5a5a5a5aL, 255),
+        Arguments.of("full range, size=255",   255, 0xa1b2c3d4L, 255),
+        Arguments.of("full range, size=256",   256, 0xd4c3b2a1L, 255),
+        Arguments.of("full range, size=257",   257, 0x1f2e3d4cL, 255),
+        Arguments.of("full range, size=512",   512, 0x4c3d2e1fL, 255),
+        Arguments.of("full range, size=513",   513, 0xface0ffL,  255));
+  }
+
+  @ParameterizedTest(name = "{0}")
+  @MethodSource("uniformCases")
+  public void testUniformRandom(String name, int count, long seed, int maxValue) {
+    assertRoundtrip(PFOREncodingTestUtils.uniform(count, seed, maxValue));
+  }
+
+  // ---------------------------------------------------------------------------
+  // Sparse distributions — tests exception encoding
+  //
+  // Most values are drawn from a narrow base range (0-3), but a fraction are
+  // replaced with full-range values (0-255). This is intended to create chunks
+  // where b1 is small but e > 0, exercising the exception path with b2 > 0.
+  // ---------------------------------------------------------------------------
+
+  /**
+   * Each row: (description, count, seed, exceptionProbabilityPct). Values are 0-3 except that each
+   * position has a {@code exceptionProbabilityPct}% chance of becoming a full-range outlier
+   * (0-255).
+   */
+  static Stream<Arguments> sparseCases() {
+    return Stream.of(
+        Arguments.of("sparse 10%, size=7",    7,   0x2a3b4c5dL, 10),
+        Arguments.of("sparse 10%, size=8",    8,   0x3c4d5e6fL, 10),
+        Arguments.of("sparse 10%, size=9",    9,   0x4e5f607aL, 10),
+        Arguments.of("sparse 10%, size=100",  100, 0x6b7c8d9eL, 10),
+        Arguments.of("sparse 10%, size=256",  256, 0x8d9eafb0L, 10),
+        Arguments.of("sparse 25%, size=256",  256, 0xc1d2e3f4L, 25),
+        Arguments.of("sparse 50%, size=256",  256, 0xf4e3d2c1L, 50),
+        Arguments.of("sparse 10%, size=512",  512, 0x10293847L, 10));
+  }
+
+  @ParameterizedTest(name = "{0}")
+  @MethodSource("sparseCases")
+  public void testSparseRandom(String name, int count, long seed, int exceptionPct) {
+    assertRoundtrip(PFOREncodingTestUtils.sparse(count, seed, exceptionPct));
+  }
+
+  // ---------------------------------------------------------------------------
+  // Non-zero minimum — tests the m (minimum subtraction) mechanism
+  //
+  // Values are drawn from a range that does not include 0, so m > 0 and the
+  // encoder must subtract it before packing. After decoding, m is added back.
+  // ---------------------------------------------------------------------------
+
+  /**
+   * Each row: (description, count, seed, minValue, range). Values are drawn uniformly from
+   * [minValue, minValue + range].
+   */
+  static Stream<Arguments> offsetCases() {
+    return Stream.of(
+        Arguments.of("offset m=100 range=3, size=8",   8,   0x31415926L, 100, 3),
+        Arguments.of("offset m=100 range=3, size=256", 256, 0x27182818L, 100, 3),
+        Arguments.of("offset m=50  range=31, size=64", 64,  0x16180339L,  50, 31),
+        Arguments.of("offset m=200 range=55, size=100",100, 0x14142135L, 200, 55),
+        Arguments.of("offset m=128 range=127, size=256",256,0x17320508L, 128, 127));
+  }
+
+  @ParameterizedTest(name = "{0}")
+  @MethodSource("offsetCases")
+  public void testOffsetRandom(String name, int count, long seed, int minValue, int range) {
+    assertRoundtrip(PFOREncodingTestUtils.withOffset(count, seed, minValue, range));
+  }
+
+  // ---------------------------------------------------------------------------
+  // Helper: encodes, decodes with the original length, and checks equality
+  // ---------------------------------------------------------------------------
+
+  private static void assertRoundtrip(int[] values) {
+    ByteBuffer encoded = PFOREncoding.encode(values);
+    int[] decoded = PFOREncoding.decode(encoded, values.length);
+    assertThat(decoded).isEqualTo(values);
+  }
+}
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index a590aaf71405..b4fd82ef48ac 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -59,6 +59,7 @@ httpcomponents-httpclient5 = "5.6.1"
 hive2 = { strictly = "2.3.10"} # see rich version usage explanation above
 immutables-value = "2.12.2"
 jackson-annotations = "2.21"
+javafastpfor = "0.2.1"
 jackson-bom = "2.21.4"
 jackson214 = { strictly = "2.14.2"}
 jackson215 = { strictly = "2.15.2"} # see rich version usage explanation above
@@ -154,6 +155,7 @@ hive2-service = { module = "org.apache.hive:hive-service", version.ref = "hive2"
 httpcomponents-httpclient5 = { module = "org.apache.httpcomponents.client5:httpclient5", version.ref = "httpcomponents-httpclient5" }
 immutables-value = { module = "org.immutables:value", version.ref = "immutables-value" }
 jackson-bom = { module = "com.fasterxml.jackson:jackson-bom", version.ref = "jackson-bom" }
+javafastpfor = { module = "me.lemire.integercompression:JavaFastPFOR", version.ref = "javafastpfor" }
 jackson-core = { module = "com.fasterxml.jackson.core:jackson-core", version.ref = "jackson-bom" }
 jackson-databind = { module = "com.fasterxml.jackson.core:jackson-databind", version.ref = "jackson-bom" }
 jackson-annotations = { module = "com.fasterxml.jackson.core:jackson-annotations", version.ref = "jackson-annotations" }
diff --git a/jmh.gradle b/jmh.gradle
index a4d794f1e41a..f7c12a2d562a 100644
--- a/jmh.gradle
+++ b/jmh.gradle
@@ -85,4 +85,10 @@ configure(jmhProjects) {
   }
 
   tasks.jmh.finalizedBy tasks.jmhReport
+
+  if (project.path == ':iceberg-core') {
+    dependencies {
+      jmhImplementation(libs.javafastpfor)
+    }
+  }
 }

From 6ec6d96682a5a320ccedc0f5d2adccd5dc715d92 Mon Sep 17 00:00:00 2001
From: Ryan Blue <blue@apache.org>
Date: Fri, 17 Apr 2026 12:20:07 -0700
Subject: [PATCH 2/5] Add initial read-only bitmap implementation.

---
 .../iceberg/mumbling/MumblingBitmap.java      | 120 ++++++
 .../apache/iceberg/mumbling/PFOREncoding.java |  65 ++++
 .../iceberg/mumbling/TestMumblingBitmap.java  | 352 ++++++++++++++++++
 3 files changed, 537 insertions(+)
 create mode 100644 core/src/main/java/org/apache/iceberg/mumbling/MumblingBitmap.java
 create mode 100644 core/src/test/java/org/apache/iceberg/mumbling/TestMumblingBitmap.java

diff --git a/core/src/main/java/org/apache/iceberg/mumbling/MumblingBitmap.java b/core/src/main/java/org/apache/iceberg/mumbling/MumblingBitmap.java
new file mode 100644
index 000000000000..8758068e5624
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/mumbling/MumblingBitmap.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.mumbling;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Read-only view of a Mumbling compressed bitmap stored in a {@link ByteBuffer}.
+ *
+ * <p>The bitmap is lazy: no decoding is done at construction time. On the first call to {@link
+ * #isSet}, the PFOR-encoded descriptor array is decoded and used to build an offsets array that
+ * maps each container index to its absolute byte position in the buffer. This offsets array is the
+ * only derived state kept by this class.
+ *
+ * <p>Format (all integers unsigned, little-endian):
+ *
+ * <ul>
+ *   <li>Header (6 bytes): version (1), cardinality (3), container count (2)
+ *   <li>Descriptor array: PFOR-encoded, one byte per container
+ *   <li>Containers: concatenated sparse (0–31 bytes) or dense (32 bytes) containers
+ * </ul>
+ */
+class MumblingBitmap {
+  private static final int VERSION = 1;
+  private static final int HEADER_SIZE = 6;
+  private static final int DENSE_CONTAINER_BYTES = 32;
+
+  private final ByteBuffer data;
+  private int[] offsets;
+
+  /**
+   * Wraps the bitmap that starts at {@code data.position()}.
+   *
+   * <p>The buffer is duplicated so that later changes to the caller's position or limit cannot
+   * shift where the lazily decoded header and containers are read from.
+   *
+   * @throws IllegalArgumentException if the version byte is not the supported version
+   */
+  MumblingBitmap(ByteBuffer data) {
+    int version = data.get(data.position()) & 0xFF;
+    if (version != VERSION) {
+      throw new IllegalArgumentException("Unsupported Mumbling bitmap version: " + version);
+    }
+    this.data = data.duplicate();
+  }
+
+  /**
+   * Returns {@code true} if the bit at {@code pos} is set in the bitmap.
+   *
+   * <p>Positions beyond the range of any container are always unset.
+   */
+  boolean isSet(int pos) {
+    int containerIndex = pos >>> 8;
+    int posInContainer = pos & 0xFF;
+    int[] starts = ensureOffsets();
+    if (containerIndex >= starts.length - 1) {
+      return false;
+    }
+
+    int containerStart = starts[containerIndex];
+    int containerLength = starts[containerIndex + 1] - containerStart;
+    if (containerLength == DENSE_CONTAINER_BYTES) {
+      // Dense: 32-byte bitset, MSB of byte 0 is position 0
+      int bitIdx = 7 - (posInContainer & 7);
+      return ((data.get(containerStart + (posInContainer >>> 3)) >>> bitIdx) & 1) == 1;
+    }
+
+    // Sparse: sorted list of set positions; stop at the first stored value >= the target
+    for (int i = 0; i < containerLength; i += 1) {
+      int stored = data.get(containerStart + i) & 0xFF;
+      if (stored >= posInContainer) {
+        return stored == posInContainer;
+      }
+    }
+    return false;
+  }
+
+  private int[] ensureOffsets() {
+    if (offsets == null) {
+      offsets = buildOffsets(); // decoded once on first lookup, then reused
+    }
+    return offsets;
+  }
+
+  /** Decodes the descriptor array into absolute container start offsets (plus an end entry). */
+  private int[] buildOffsets() {
+    int base = data.position();
+    // Container count: bytes 4–5, little-endian
+    int containerCount = (data.get(base + 4) & 0xFF) | ((data.get(base + 5) & 0xFF) << 8);
+
+    // decodeOffsets fills relative offsets and returns the descriptor bytes it consumed
+    int[] result = new int[containerCount + 1];
+    ByteBuffer descriptorBuffer = data.duplicate();
+    descriptorBuffer.position(base + HEADER_SIZE);
+    int descriptorBytes = PFOREncoding.decodeOffsets(descriptorBuffer, containerCount, result);
+
+    // Containers follow the descriptors; shift relative offsets to absolute buffer positions
+    int containersStart = base + HEADER_SIZE + descriptorBytes;
+    for (int i = 0; i <= containerCount; i += 1) {
+      result[i] += containersStart;
+    }
+    return result;
+  }
+}
diff --git a/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java b/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java
index 5981dbe2713a..d40e05bd8d7c 100644
--- a/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java
+++ b/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java
@@ -47,6 +47,9 @@
  */
 class PFOREncoding {
   private static final int CHUNK_SIZE = 256;
+  private static final int DENSE_DESCRIPTOR_BIT = 0x20;
+  private static final int SPARSE_LENGTH_MASK = 0x1F;
+  private static final int DENSE_CONTAINER_BYTES = 32;
 
   private PFOREncoding() {}
 
@@ -133,6 +136,68 @@ static int[] decode(ByteBuffer encoded, int count) {
     return output;
   }
 
+  /**
+   * Converts container offsets into per-container lengths and PFOR-encodes those lengths.
+   *
+   * <p>Container {@code i} has length {@code offsets[i + 1] - offsets[i]}, so {@code offsets}
+   * must hold at least {@code count + 1} entries. Each length is encoded as a descriptor value.
+   *
+   * @param offsets container offset array with {@code count + 1} entries
+   * @param count number of containers
+   * @return PFOR-encoded descriptor bytes
+   */
+  static ByteBuffer encodeOffsets(int[] offsets, int count) {
+    int[] descriptors = new int[count];
+    for (int next = 1; next <= count; next += 1) {
+      descriptors[next - 1] = offsets[next] - offsets[next - 1];
+    }
+
+    return encode(descriptors, count);
+  }
+
+  /**
+   * Decodes PFOR-encoded descriptors and accumulates container sizes into an offset array.
+   *
+   * <p>Reading starts at {@code encoded.position()}; the buffer's position is not modified.
+   *
+   * <p>A descriptor with the dense bit ({@code 0x20}) set is a 32-byte container; otherwise the
+   * low 5 bits give the container's byte length. Sizes are accumulated as a prefix sum starting
+   * at 0, so the caller must still shift the relative offsets to absolute positions.
+   *
+   * <p>On return, {@code offsets[i]} is the total byte length of containers {@code 0..i-1} and
+   * {@code offsets[count]} is the total byte length of all containers.
+   *
+   * @param encoded PFOR-encoded descriptor bytes
+   * @param count number of containers (descriptors) to decode
+   * @param offsets array to fill; must have length &gt;= count + 1
+   * @return number of bytes consumed from {@code encoded}
+   */
+  static int decodeOffsets(ByteBuffer encoded, int count, int[] offsets) {
+    if (count == 0) {
+      offsets[0] = 0;
+      return 0;
+    }
+
+    int[] chunk = new int[Math.min(CHUNK_SIZE, count)];
+    int pos = encoded.position();
+    int total = 0;
+    int done = 0;
+    while (done < count) {
+      int chunkLen = Math.min(CHUNK_SIZE, count - done);
+      pos += decodeChunk(encoded, pos, chunk, 0, chunkLen);
+      for (int i = 0; i < chunkLen; i += 1) {
+        int descriptor = chunk[i];
+        boolean dense = (descriptor & DENSE_DESCRIPTOR_BIT) != 0;
+        offsets[done + i] = total;
+        total += dense ? DENSE_CONTAINER_BYTES : (descriptor & SPARSE_LENGTH_MASK);
+      }
+      done += chunkLen;
+    }
+
+    offsets[count] = total;
+    return pos - encoded.position();
+  }
+
   /**
    * Encodes one chunk into {@code out} starting at absolute position {@code outPos}. Returns the
    * number of bytes written.
diff --git a/core/src/test/java/org/apache/iceberg/mumbling/TestMumblingBitmap.java b/core/src/test/java/org/apache/iceberg/mumbling/TestMumblingBitmap.java
new file mode 100644
index 000000000000..3745ced61f2e
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/mumbling/TestMumblingBitmap.java
@@ -0,0 +1,352 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.mumbling;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import org.junit.jupiter.api.Test;
+
+class TestMumblingBitmap {
+
+  // ---------------------------------------------------------------------------
+  // Empty bitmap (0 containers)
+  // ---------------------------------------------------------------------------
+  @Test
+  void testEmptyBitmap() {
+    MumblingBitmap bitmap = bitmap(new int[0]);
+    assertThat(bitmap.isSet(0)).isFalse();
+    assertThat(bitmap.isSet(255)).isFalse();
+    assertThat(bitmap.isSet(256)).isFalse();
+  }
+
+  // ---------------------------------------------------------------------------
+  // Single empty sparse container (descriptor = 0, no container bytes)
+  // ---------------------------------------------------------------------------
+  @Test
+  void testEmptySparseContainer() {
+    MumblingBitmap bitmap = bitmap(sparse());
+    assertThat(bitmap.isSet(0)).isFalse();
+    assertThat(bitmap.isSet(100)).isFalse();
+    assertThat(bitmap.isSet(255)).isFalse();
+  }
+
+  // ---------------------------------------------------------------------------
+  // Single sparse container with specific positions
+  // ---------------------------------------------------------------------------
+  @Test
+  void testSparseContainerSetPositions() {
+    MumblingBitmap bitmap = bitmap(sparse(0, 5, 100, 255));
+    assertThat(bitmap.isSet(0)).isTrue();
+    assertThat(bitmap.isSet(5)).isTrue();
+    assertThat(bitmap.isSet(100)).isTrue();
+    assertThat(bitmap.isSet(255)).isTrue();
+
+    assertThat(bitmap.isSet(1)).isFalse();
+    assertThat(bitmap.isSet(4)).isFalse();
+    assertThat(bitmap.isSet(6)).isFalse();
+    assertThat(bitmap.isSet(99)).isFalse();
+    assertThat(bitmap.isSet(101)).isFalse();
+    assertThat(bitmap.isSet(254)).isFalse();
+  }
+
+  // ---------------------------------------------------------------------------
+  // Full sparse container (31 positions, the maximum)
+  // ---------------------------------------------------------------------------
+  @Test
+  void testMaxSparseContainer() {
+    int[] positions = new int[31];
+    for (int i = 0; i < 31; i += 1) {
+      positions[i] = i * 8; // 0, 8, 16, ..., 240
+    }
+    MumblingBitmap bitmap = bitmap(sparse(positions));
+    for (int p : positions) {
+      assertThat(bitmap.isSet(p)).isTrue();
+    }
+    assertThat(bitmap.isSet(1)).isFalse();
+    assertThat(bitmap.isSet(7)).isFalse();
+    assertThat(bitmap.isSet(255)).isFalse();
+  }
+
+  // ---------------------------------------------------------------------------
+  // Dense container: all bits set (32 bytes of 0xFF)
+  // ---------------------------------------------------------------------------
+  @Test
+  void testDenseContainerAllSet() {
+    byte[] container = new byte[32];
+    Arrays.fill(container, (byte) 0xFF);
+    MumblingBitmap bitmap = bitmap(dense(container));
+    for (int i = 0; i < 256; i += 1) {
+      assertThat(bitmap.isSet(i)).isTrue();
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Dense container: no bits set (32 bytes of 0x00)
+  // ---------------------------------------------------------------------------
+  @Test
+  void testDenseContainerNoneSet() {
+    byte[] container = new byte[32];
+    MumblingBitmap bitmap = bitmap(dense(container));
+    for (int i = 0; i < 256; i += 1) {
+      assertThat(bitmap.isSet(i)).isFalse();
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Dense container spec examples
+  // ---------------------------------------------------------------------------
+
+  // `FF FF FF FF 00 ... 00` → positions 0-31
+  @Test
+  void testDenseSpecExample1() {
+    byte[] container = new byte[32];
+    container[0] = (byte) 0xFF;
+    container[1] = (byte) 0xFF;
+    container[2] = (byte) 0xFF;
+    container[3] = (byte) 0xFF;
+    MumblingBitmap bitmap = bitmap(dense(container));
+    for (int i = 0; i <= 31; i += 1) {
+      assertThat(bitmap.isSet(i)).isTrue();
+    }
+    assertThat(bitmap.isSet(32)).isFalse();
+    assertThat(bitmap.isSet(255)).isFalse();
+  }
+
+  // `FF FF FF FF A0 00 ... 00` → positions 0-32 and 34
+  // 0xA0 = 10100000 (MSB first): pos 32 and pos 34 are set, pos 33 and 35-39 are not
+  @Test
+  void testDenseSpecExample2() {
+    byte[] container = new byte[32];
+    Arrays.fill(container, 0, 4, (byte) 0xFF);
+    container[4] = (byte) 0xA0;
+    MumblingBitmap bitmap = bitmap(dense(container));
+    for (int i = 0; i <= 32; i += 1) {
+      assertThat(bitmap.isSet(i)).isTrue();
+    }
+    assertThat(bitmap.isSet(33)).isFalse();
+    // Bit 5 of 0xA0 is also set, which MSB-first is position 34
+    assertThat(bitmap.isSet(34)).isTrue();
+    assertThat(bitmap.isSet(35)).isFalse();
+    assertThat(bitmap.isSet(255)).isFalse();
+  }
+
+  // `FF FF 00 ... 00 FF FF` → positions 0-15, 240-255
+  @Test
+  void testDenseSpecExample3() {
+    byte[] container = new byte[32];
+    container[0] = (byte) 0xFF;
+    container[1] = (byte) 0xFF;
+    container[30] = (byte) 0xFF;
+    container[31] = (byte) 0xFF;
+    MumblingBitmap bitmap = bitmap(dense(container));
+    for (int i = 0; i <= 15; i += 1) {
+      assertThat(bitmap.isSet(i)).isTrue();
+    }
+    for (int i = 240; i <= 255; i += 1) {
+      assertThat(bitmap.isSet(i)).isTrue();
+    }
+    assertThat(bitmap.isSet(16)).isFalse();
+    assertThat(bitmap.isSet(239)).isFalse();
+  }
+
+  // `AA AA ... AA AA` → even positions: 0, 2, 4, ...
+  // 0xAA = 10101010: MSB-first gives positions 0, 2, 4, 6 set per byte
+  @Test
+  void testDenseSpecExample4() {
+    byte[] container = new byte[32];
+    Arrays.fill(container, (byte) 0xAA);
+    MumblingBitmap bitmap = bitmap(dense(container));
+    for (int i = 0; i < 256; i += 1) {
+      assertThat(bitmap.isSet(i)).isEqualTo(i % 2 == 0);
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Multiple containers: positions in different containers
+  // Container 0 (pos 0–255): sparse {5}
+  // Container 1 (pos 256–511): sparse {10} → global position 266
+  // Container 2 (pos 512–767): empty sparse
+  // ---------------------------------------------------------------------------
+  @Test
+  void testMultipleContainers() {
+    MumblingBitmap bitmap =
+        bitmap(
+            new ContainerSpec[] {
+              sparse(5), sparse(10), sparse()
+            });
+
+    assertThat(bitmap.isSet(5)).isTrue(); // container 0, pos 5
+    assertThat(bitmap.isSet(266)).isTrue(); // container 1, pos 10
+    assertThat(bitmap.isSet(512)).isFalse(); // container 2, empty
+    assertThat(bitmap.isSet(4)).isFalse();
+    assertThat(bitmap.isSet(265)).isFalse();
+    assertThat(bitmap.isSet(267)).isFalse();
+  }
+
+  // ---------------------------------------------------------------------------
+  // Mixed sparse and dense containers
+  // Container 0: dense with byte 0 = 0xFF (positions 0–7 set)
+  // Container 1: sparse {0} (global position 256)
+  // ---------------------------------------------------------------------------
+  @Test
+  void testMixedSparseAndDense() {
+    byte[] denseContainer = new byte[32];
+    denseContainer[0] = (byte) 0xFF;
+    MumblingBitmap bitmap =
+        bitmap(
+            new ContainerSpec[] {
+              dense(denseContainer), sparse(0)
+            });
+
+    for (int i = 0; i < 8; i += 1) {
+      assertThat(bitmap.isSet(i)).isTrue();
+    }
+    assertThat(bitmap.isSet(8)).isFalse();
+    assertThat(bitmap.isSet(256)).isTrue(); // container 1, pos 0
+    assertThat(bitmap.isSet(257)).isFalse();
+  }
+
+  // ---------------------------------------------------------------------------
+  // Position beyond container count is always unset
+  // ---------------------------------------------------------------------------
+  @Test
+  void testPositionBeyondContainerCount() {
+    MumblingBitmap bitmap = bitmap(sparse(5));
+    // Only container 0 exists; container 1 (pos 256+) does not
+    assertThat(bitmap.isSet(256)).isFalse();
+    assertThat(bitmap.isSet(511)).isFalse();
+  }
+
+  // ---------------------------------------------------------------------------
+  // Buffer with non-zero initial position
+  // ---------------------------------------------------------------------------
+  @Test
+  void testBufferWithOffset() {
+    // Prepend 4 bytes of garbage before the actual bitmap data
+    ByteBuffer raw = build(new ContainerSpec[] {sparse(42)});
+    byte[] rawBytes = new byte[raw.remaining()];
+    raw.get(rawBytes);
+
+    ByteBuffer padded = ByteBuffer.allocate(4 + rawBytes.length);
+    padded.position(4);
+    padded.put(rawBytes);
+    padded.position(4); // position the buffer at the start of bitmap data
+
+    MumblingBitmap bitmap = new MumblingBitmap(padded);
+    assertThat(bitmap.isSet(42)).isTrue();
+    assertThat(bitmap.isSet(43)).isFalse();
+  }
+
+  // ---------------------------------------------------------------------------
+  // Lazy init: calling isSet multiple times produces consistent results
+  // ---------------------------------------------------------------------------
+  @Test
+  void testLazyInitConsistency() {
+    MumblingBitmap bitmap = bitmap(sparse(1, 2, 3));
+    for (int trial = 0; trial < 3; trial += 1) {
+      assertThat(bitmap.isSet(1)).isTrue();
+      assertThat(bitmap.isSet(2)).isTrue();
+      assertThat(bitmap.isSet(3)).isTrue();
+      assertThat(bitmap.isSet(0)).isFalse();
+      assertThat(bitmap.isSet(4)).isFalse();
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Helpers
+  // ---------------------------------------------------------------------------
+
+  /** Descriptor + bytes for a sparse container. */
+  private static ContainerSpec sparse(int... positions) {
+    byte[] bytes = new byte[positions.length];
+    for (int i = 0; i < positions.length; i += 1) {
+      bytes[i] = (byte) positions[i];
+    }
+    return new ContainerSpec(positions.length, bytes);
+  }
+
+  /** Descriptor + bytes for a dense container. */
+  private static ContainerSpec dense(byte[] container) {
+    if (container.length != 32) {
+      throw new IllegalArgumentException("Dense container must be 32 bytes");
+    }
+    return new ContainerSpec(32, container);
+  }
+
+  private static class ContainerSpec {
+    final int descriptor;
+    final byte[] bytes;
+
+    ContainerSpec(int descriptor, byte[] bytes) {
+      this.descriptor = descriptor;
+      this.bytes = bytes;
+    }
+  }
+
+  /** Builds a bitmap with a single container. */
+  private static MumblingBitmap bitmap(ContainerSpec spec) {
+    return new MumblingBitmap(build(new ContainerSpec[] {spec}));
+  }
+
+  /** Builds a bitmap with no containers. */
+  private static MumblingBitmap bitmap(int[] ignored) {
+    return new MumblingBitmap(build(new ContainerSpec[0]));
+  }
+
+  /** Builds a bitmap with multiple containers. */
+  private static MumblingBitmap bitmap(ContainerSpec[] specs) {
+    return new MumblingBitmap(build(specs));
+  }
+
+  private static ByteBuffer build(ContainerSpec[] specs) {
+    // Collect one descriptor per container and the total size of all container payloads
+    int containerCount = specs.length;
+    int[] descriptors = new int[containerCount];
+    int payloadBytes = 0;
+    for (int i = 0; i < containerCount; i += 1) {
+      descriptors[i] = specs[i].descriptor;
+      payloadBytes += specs[i].bytes.length;
+    }
+
+    // No descriptors are encoded for an empty bitmap
+    ByteBuffer encodedDescriptors;
+    if (containerCount > 0) {
+      encodedDescriptors = PFOREncoding.encode(descriptors, containerCount);
+    } else {
+      encodedDescriptors = ByteBuffer.allocate(0);
+    }
+
+    ByteBuffer result = ByteBuffer.allocate(6 + encodedDescriptors.remaining() + payloadBytes);
+
+    // Header: version (1 byte), cardinality (3 bytes LE), container count (2 bytes LE)
+    byte countLow = (byte) (containerCount & 0xFF);
+    byte countHigh = (byte) ((containerCount >>> 8) & 0xFF);
+    result.put(new byte[] {1, 0, 0, 0, countLow, countHigh});
+
+    result.put(encodedDescriptors);
+    for (ContainerSpec spec : specs) {
+      result.put(spec.bytes);
+    }
+
+    result.flip();
+    return result;
+  }
+}

From 13197bfd2ad1c5da54a5a7ba58a0778252fbe74e Mon Sep 17 00:00:00 2001
From: Ryan Blue <blue@apache.org>
Date: Mon, 1 Jun 2026 16:57:18 -0700
Subject: [PATCH 3/5] Core: Clean up generated code.

---
 .../mumbling/PFOREncodingBenchmark.java       |   44 +-
 .../apache/iceberg/mumbling/BitPacking.java   | 1089 ++++++++++-------
 .../iceberg/mumbling/MumblingBitmap.java      |  115 +-
 .../apache/iceberg/mumbling/PFOREncoding.java |  533 ++++----
 .../mumbling/PFOREncodingTestUtils.java       |   66 -
 .../iceberg/mumbling/PFORRandomData.java      |   47 +
 .../iceberg/mumbling/TestMumblingBitmap.java  |  305 ++---
 .../iceberg/mumbling/TestPFOREncoding.java    |   86 +-
 .../mumbling/TestPFOREncodingRandom.java      |  174 +--
 9 files changed, 1336 insertions(+), 1123 deletions(-)
 delete mode 100644 core/src/test/java/org/apache/iceberg/mumbling/PFOREncodingTestUtils.java
 create mode 100644 core/src/test/java/org/apache/iceberg/mumbling/PFORRandomData.java

diff --git a/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java b/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java
index 3c6a21d658e4..cbb5d69e85f8 100644
--- a/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java
+++ b/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java
@@ -19,6 +19,7 @@
 package org.apache.iceberg.mumbling;
 
 import java.nio.ByteBuffer;
+import java.util.Random;
 import java.util.concurrent.TimeUnit;
 import me.lemire.integercompression.FastPFOR128;
 import me.lemire.integercompression.IntWrapper;
@@ -62,10 +63,6 @@
 @Timeout(time = 5, timeUnit = TimeUnit.MINUTES)
 public class PFOREncodingBenchmark {
 
-  // Fixed seeds for reproducibility
-  private static final long DESCRIPTOR_SEED = 0x5a5a5a5a5a5a5a5aL;
-  private static final long UNIFORM_SEED = 0xa1b2c3d4e5f60708L;
-
   // Iceberg PFOR input arrays
   private int[] descriptorValues;
   private int[] uniformValues;
@@ -77,6 +74,9 @@ public class PFOREncodingBenchmark {
   // Reusable encode buffer (avoids allocation in the encode hot path)
   private ByteBuffer encodeBuffer;
 
+  // Reusable decode output (avoids allocation in the decode hot path)
+  private int[] decodeOutput;
+
   // JavaFastPFOR compressed arrays (pre-encoded for decode benchmarks)
   private int[] descriptorFastPFOREncoded;
   private int[] uniformFastPFOREncoded;
@@ -86,18 +86,23 @@ public class PFOREncodingBenchmark {
 
   @Setup
   public void setupBenchmark() {
+    Random random = new Random(1938745);
+
     // 256-value descriptor-like data: mostly [0,31] with ~5% [0,255] outliers
-    descriptorValues = PFOREncodingTestUtils.sparse(256, DESCRIPTOR_SEED, 5);
+    descriptorValues = PFORRandomData.exceptions(random, 256, 0.5f);
 
     // 256-value uniform byte data
-    uniformValues = PFOREncodingTestUtils.uniform(256, UNIFORM_SEED, 255);
+    uniformValues = PFORRandomData.uniform(random, 256, 255);
 
     // Reusable encode buffer: worst-case for a single 256-value chunk
     encodeBuffer = ByteBuffer.allocate(3 + 256);
 
+    // Reusable decode output: one entry per value in a chunk
+    decodeOutput = new int[256];
+
     // Pre-encode for decode benchmarks
-    descriptorEncoded = PFOREncoding.encode(descriptorValues);
-    uniformEncoded = PFOREncoding.encode(uniformValues);
+    descriptorEncoded = PFOREncoding.encode(descriptorValues, descriptorValues.length);
+    uniformEncoded = PFOREncoding.encode(uniformValues, uniformValues.length);
 
     // Pre-encode with JavaFastPFOR for decode benchmarks
     FastPFOR128 codec = new FastPFOR128();
@@ -115,13 +120,17 @@ public void setupBenchmark() {
   @Benchmark
   @Threads(1)
   public void encodeDescriptorIceberg(Blackhole blackhole) {
-    blackhole.consume(PFOREncoding.encode(descriptorValues, descriptorValues.length, encodeBuffer));
+    blackhole.consume(
+        PFOREncoding.encode(
+            descriptorValues, 0, encodeBuffer, encodeBuffer.position(), descriptorValues.length));
   }
 
   @Benchmark
   @Threads(1)
   public void decodeDescriptorIceberg(Blackhole blackhole) {
-    blackhole.consume(PFOREncoding.decode(descriptorEncoded, descriptorValues.length));
+    PFOREncoding.decode(
+        descriptorEncoded, descriptorEncoded.position(), decodeOutput, 0, descriptorValues.length);
+    blackhole.consume(decodeOutput);
   }
 
   // ---------------------------------------------------------------------------
@@ -131,13 +140,17 @@ public void decodeDescriptorIceberg(Blackhole blackhole) {
   @Benchmark
   @Threads(1)
   public void encodeUniformIceberg(Blackhole blackhole) {
-    blackhole.consume(PFOREncoding.encode(uniformValues, uniformValues.length, encodeBuffer));
+    blackhole.consume(
+        PFOREncoding.encode(
+            uniformValues, 0, encodeBuffer, encodeBuffer.position(), uniformValues.length));
   }
 
   @Benchmark
   @Threads(1)
   public void decodeUniformIceberg(Blackhole blackhole) {
-    blackhole.consume(PFOREncoding.decode(uniformEncoded, uniformValues.length));
+    PFOREncoding.decode(
+        uniformEncoded, uniformEncoded.position(), decodeOutput, 0, uniformValues.length);
+    blackhole.consume(decodeOutput);
   }
 
   // ---------------------------------------------------------------------------
@@ -155,7 +168,7 @@ public void encodeDescriptorFastPFOR(Blackhole blackhole) {
   @Threads(1)
   public void decodeDescriptorFastPFOR(Blackhole blackhole) {
     FastPFOR128 codec = new FastPFOR128();
-    blackhole.consume(fastPFORDecode(codec, descriptorFastPFOREncoded, descriptorValues.length));
+    blackhole.consume(fastPFORDecode(codec, descriptorFastPFOREncoded, fastPFOROutputBuffer));
   }
 
   // ---------------------------------------------------------------------------
@@ -173,7 +186,7 @@ public void encodeUniformFastPFOR(Blackhole blackhole) {
   @Threads(1)
   public void decodeUniformFastPFOR(Blackhole blackhole) {
     FastPFOR128 codec = new FastPFOR128();
-    blackhole.consume(fastPFORDecode(codec, uniformFastPFOREncoded, uniformValues.length));
+    blackhole.consume(fastPFORDecode(codec, uniformFastPFOREncoded, fastPFOROutputBuffer));
   }
 
   // ---------------------------------------------------------------------------
@@ -190,8 +203,7 @@ private static int[] fastPFOREncode(FastPFOR128 codec, int[] values) {
     return result;
   }
 
-  private static int[] fastPFORDecode(FastPFOR128 codec, int[] encoded, int count) {
-    int[] output = new int[count];
+  private static int[] fastPFORDecode(FastPFOR128 codec, int[] encoded, int[] output) {
     IntWrapper inPos = new IntWrapper(0);
     IntWrapper outPos = new IntWrapper(0);
     codec.uncompress(encoded, inPos, encoded.length, output, outPos);
diff --git a/core/src/main/java/org/apache/iceberg/mumbling/BitPacking.java b/core/src/main/java/org/apache/iceberg/mumbling/BitPacking.java
index 9333ac2c24c8..080d9556504b 100644
--- a/core/src/main/java/org/apache/iceberg/mumbling/BitPacking.java
+++ b/core/src/main/java/org/apache/iceberg/mumbling/BitPacking.java
@@ -21,479 +21,748 @@
 import java.nio.ByteBuffer;
 
 /**
- * MSB-first bit packing and unpacking for values of 1–7 bits.
+ * Bit packing and unpacking for values with bit widths of 0–8.
  *
- * <p>Values are packed in groups of 8: each group of 8 values occupies exactly {@code b} bytes.
- * A trailing partial group of {@code rem = count & 7} values is left-aligned in {@code
- * ceil(rem * b / 8)} bytes, padded with zero bits.
- *
- * <p>Each specialized method (b=1..7) has a compile-time-constant bit width, allowing the JIT to
- * fold shift amounts to immediates and inline the byte sequence, eliminating the inner loop and
- * its loop-carried shift dependency.
+ * <p>The least-significant bits of each value are packed with the first value occupying the most
+ * significant bits of the output. Output is padded to the nearest byte with 0s. For example,
+ * packing values [0b11, 0b10, 0b01] with width 2 produces 0b11100100.
  */
 class BitPacking {
 
   private BitPacking() {}
 
   /**
-   * Packs {@code count} values from {@code values[valPos..]} into {@code out} starting at absolute
-   * position {@code outPos}, using {@code b} bits per value (1–7).
+   * Packs {@code width} least-significant bits of {@code count} values into a data buffer.
+   *
+   * <p>Output is padded to the nearest byte with 0s.
+   *
+   * <p>Values are written to the buffer's underlying storage, but the buffer's position and limit
+   * are not modified.
+   *
+   * @param width number of bits of each value to pack
+   * @param values array containing source values to pack
+   * @param valueOffset starting index of values to pack
+   * @param data an output {@link ByteBuffer}
+   * @param dataOffset starting index for output in the data buffer
+   * @param count the number of values to pack
+   * @return the number of bytes written to the data buffer
+   */
+  static int packBits(
+      int width, int[] values, int valueOffset, ByteBuffer data, int dataOffset, int count) {
+    return switch (width) {
+      case 0 -> 0;
+      case 1 -> packBits1(values, valueOffset, data, dataOffset, count);
+      case 2 -> packBits2(values, valueOffset, data, dataOffset, count);
+      case 3 -> packBits3(values, valueOffset, data, dataOffset, count);
+      case 4 -> packBits4(values, valueOffset, data, dataOffset, count);
+      case 5 -> packBits5(values, valueOffset, data, dataOffset, count);
+      case 6 -> packBits6(values, valueOffset, data, dataOffset, count);
+      case 7 -> packBits7(values, valueOffset, data, dataOffset, count);
+      case 8 -> copyAsBytes(values, valueOffset, data, dataOffset, count);
+      default -> throw new IllegalArgumentException("Invalid bit width: " + width);
+    };
+  }
+
+  /**
+   * Unpacks {@code count} values from a data buffer containing {@code width} bits of each value.
+   *
+   * <p>Unused bits in the last input byte are ignored.
+   *
+   * <p>The input buffer's position and limit are not modified.
+   *
+   * @param width number of bits of each value to unpack
+   * @param data an input {@link ByteBuffer}
+   * @param dataOffset starting index for input in the data buffer
+   * @param output array for unpacked output values
+   * @param outputOffset starting index to store values in the output array
+   * @param count the number of values to unpack
+   * @return the number of bytes read from the data buffer
+   */
+  static int unpackBits(
+      int width, ByteBuffer data, int dataOffset, int[] output, int outputOffset, int count) {
+    return switch (width) {
+      case 0 -> 0;
+      case 1 -> unpackBits1(data, dataOffset, output, outputOffset, count);
+      case 2 -> unpackBits2(data, dataOffset, output, outputOffset, count);
+      case 3 -> unpackBits3(data, dataOffset, output, outputOffset, count);
+      case 4 -> unpackBits4(data, dataOffset, output, outputOffset, count);
+      case 5 -> unpackBits5(data, dataOffset, output, outputOffset, count);
+      case 6 -> unpackBits6(data, dataOffset, output, outputOffset, count);
+      case 7 -> unpackBits7(data, dataOffset, output, outputOffset, count);
+      case 8 -> copyAsBytes(data, dataOffset, output, outputOffset, count);
+      default -> throw new IllegalArgumentException("Invalid bit width: " + width);
+    };
+  }
+
+  /**
+   * Copy byte values from an int[] into a buffer.
+   *
+   * <p>Values must be bytes stored in an integer array. The 3 most significant bytes of the values
+   * are ignored.
+   *
+   * @param source array of source values to copy
+   * @param sourceOffset starting offset of values to copy
+   * @param out output buffer values will be copied to
+   * @param outOffset starting offset in the output buffer
+   * @param count number of values (bytes) to copy
+   * @return the number of bytes written to the buffer
    */
-  static void packBits(
-      int[] values, int valPos, int count, ByteBuffer out, int outPos, int b) {
-    switch (b) {
-      case 1:
-        packBits1(values, valPos, count, out, outPos);
-        break;
-      case 2:
-        packBits2(values, valPos, count, out, outPos);
-        break;
-      case 3:
-        packBits3(values, valPos, count, out, outPos);
-        break;
-      case 4:
-        packBits4(values, valPos, count, out, outPos);
-        break;
-      case 5:
-        packBits5(values, valPos, count, out, outPos);
-        break;
-      case 6:
-        packBits6(values, valPos, count, out, outPos);
-        break;
-      case 7:
-        packBits7(values, valPos, count, out, outPos);
-        break;
-      default:
-        throw new IllegalArgumentException("Invalid bit width: " + b);
+  private static int copyAsBytes(
+      int[] source, int sourceOffset, ByteBuffer out, int outOffset, int count) {
+    for (int i = 0; i < count; i += 1) {
+      out.put(outOffset + i, (byte) source[sourceOffset + i]);
     }
+
+    return count;
   }
 
   /**
-   * Unpacks {@code count} values from {@code data} starting at absolute position {@code dataPos}
-   * into {@code output}, using {@code b} bits per value (1–7).
+   * Copy byte values from a buffer into an int[].
+   *
+   * @param data buffer of source values to copy
+   * @param dataOffset starting offset in the input buffer
+   * @param out output array values will be copied to
+   * @param outOffset starting offset in the output array
+   * @param count number of values (bytes) to copy
+   * @return the number of bytes read from the buffer
    */
-  static void unpackBits(ByteBuffer data, int dataPos, int[] output, int count, int b) {
-    switch (b) {
-      case 1:
-        unpackBits1(data, dataPos, output, count);
-        break;
-      case 2:
-        unpackBits2(data, dataPos, output, count);
-        break;
-      case 3:
-        unpackBits3(data, dataPos, output, count);
-        break;
-      case 4:
-        unpackBits4(data, dataPos, output, count);
-        break;
-      case 5:
-        unpackBits5(data, dataPos, output, count);
-        break;
-      case 6:
-        unpackBits6(data, dataPos, output, count);
-        break;
-      case 7:
-        unpackBits7(data, dataPos, output, count);
-        break;
-      default:
-        throw new IllegalArgumentException("Invalid bit width: " + b);
+  private static int copyAsBytes(
+      ByteBuffer data, int dataOffset, int[] out, int outOffset, int count) {
+    for (int i = 0; i < count; i += 1) {
+      out[outOffset + i] = (data.get(dataOffset + i) & 0xFF);
     }
+
+    return count;
   }
 
   // ---------------------------------------------------------------------------
-  // Specialized pack: b=1..7
-  // Each method packs 8 values into exactly b bytes (full groups), plus a
-  // partial group of rem < 8 values left-aligned in ceil(rem*b/8) bytes.
+  // Specialized pack: width=1..7
+  // Each method packs 8 values into exactly width bytes (full groups), plus a
+  // partial group of remaining < 8 values in ceil(remaining*width/8) bytes.
   // ---------------------------------------------------------------------------
 
-  /** 1-bit values: 8 values → 1 byte. */
-  private static void packBits1(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int vBase = valPos + (g << 3);
-      out.put(
-          outPos + g,
-          (byte)
-              (((values[vBase] & 1) << 7)
-                  | ((values[vBase + 1] & 1) << 6)
-                  | ((values[vBase + 2] & 1) << 5)
-                  | ((values[vBase + 3] & 1) << 4)
-                  | ((values[vBase + 4] & 1) << 3)
-                  | ((values[vBase + 5] & 1) << 2)
-                  | ((values[vBase + 6] & 1) << 1)
-                  | (values[vBase + 7] & 1)));
-    }
-    int rem = count & 7;
-    if (rem > 0) {
-      int vBase = valPos + (fullGroups << 3);
-      long word = 0;
-      for (int k = 0; k < rem; k += 1) {
-        word = (word << 1) | (values[vBase + k] & 1);
+  /** 1-bit values: 8 values into 1 byte. */
+  private static int packBits1(
+      int[] values, int valueOffset, ByteBuffer data, int dataOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = valueOffset + 8 * group;
+      int outputOffset = dataOffset + group;
+      int word =
+          packWord1(
+              values[groupOffset],
+              values[groupOffset + 1],
+              values[groupOffset + 2],
+              values[groupOffset + 3],
+              values[groupOffset + 4],
+              values[groupOffset + 5],
+              values[groupOffset + 6],
+              values[groupOffset + 7]);
+      data.put(outputOffset, (byte) word);
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      int groupOffset = valueOffset + 8 * fullGroups;
+      int outputOffset = dataOffset + fullGroups;
+      int word =
+          packWord1(
+              values[groupOffset],
+              remaining > 1 ? values[groupOffset + 1] : 0,
+              remaining > 2 ? values[groupOffset + 2] : 0,
+              remaining > 3 ? values[groupOffset + 3] : 0,
+              remaining > 4 ? values[groupOffset + 4] : 0,
+              remaining > 5 ? values[groupOffset + 5] : 0,
+              remaining > 6 ? values[groupOffset + 6] : 0,
+              0);
+      data.put(outputOffset, (byte) word);
+    }
+
+    return byteWidth(count);
+  }
+
+  private static int packWord1(int a, int b, int c, int d, int e, int f, int g, int h) {
+    return ((a & 0b1) << 7)
+        | ((b & 0b1) << 6)
+        | ((c & 0b1) << 5)
+        | ((d & 0b1) << 4)
+        | ((e & 0b1) << 3)
+        | ((f & 0b1) << 2)
+        | ((g & 0b1) << 1)
+        | (h & 0b1);
+  }
+
+  /** 2-bit values: 8 values into 2 bytes. */
+  private static int packBits2(
+      int[] values, int valueOffset, ByteBuffer data, int dataOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = valueOffset + 8 * group;
+      int outputOffset = dataOffset + 2 * group;
+      int word =
+          packWord2(
+              values[groupOffset],
+              values[groupOffset + 1],
+              values[groupOffset + 2],
+              values[groupOffset + 3],
+              values[groupOffset + 4],
+              values[groupOffset + 5],
+              values[groupOffset + 6],
+              values[groupOffset + 7]);
+      data.put(outputOffset, (byte) (word >>> 8));
+      data.put(outputOffset + 1, (byte) word);
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      int groupOffset = valueOffset + 8 * fullGroups;
+      int outputOffset = dataOffset + 2 * fullGroups;
+      int word =
+          packWord2(
+              values[groupOffset],
+              remaining > 1 ? values[groupOffset + 1] : 0,
+              remaining > 2 ? values[groupOffset + 2] : 0,
+              remaining > 3 ? values[groupOffset + 3] : 0,
+              remaining > 4 ? values[groupOffset + 4] : 0,
+              remaining > 5 ? values[groupOffset + 5] : 0,
+              remaining > 6 ? values[groupOffset + 6] : 0,
+              0);
+      data.put(outputOffset, (byte) (word >>> 8));
+      if (remaining > 4) {
+        data.put(outputOffset + 1, (byte) word);
       }
-      out.put(outPos + fullGroups, (byte) (word << (8 - rem)));
     }
+
+    return byteWidth(2 * count);
   }
 
-  /** 2-bit values: 8 values → 2 bytes. */
-  private static void packBits2(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int vBase = valPos + (g << 3);
-      int oBase = outPos + (g << 1);
+  private static int packWord2(int a, int b, int c, int d, int e, int f, int g, int h) {
+    return ((a & 0b11) << 14)
+        | ((b & 0b11) << 12)
+        | ((c & 0b11) << 10)
+        | ((d & 0b11) << 8)
+        | ((e & 0b11) << 6)
+        | ((f & 0b11) << 4)
+        | ((g & 0b11) << 2)
+        | (h & 0b11);
+  }
+
+  /** 3-bit values: 8 values into 3 bytes. */
+  private static int packBits3(
+      int[] values, int valueOffset, ByteBuffer data, int dataOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = valueOffset + 8 * group;
+      int outputOffset = dataOffset + 3 * group;
+      int word =
+          packWord3(
+              values[groupOffset],
+              values[groupOffset + 1],
+              values[groupOffset + 2],
+              values[groupOffset + 3],
+              values[groupOffset + 4],
+              values[groupOffset + 5],
+              values[groupOffset + 6],
+              values[groupOffset + 7]);
+      data.put(outputOffset, (byte) (word >>> 16));
+      data.put(outputOffset + 1, (byte) (word >>> 8));
+      data.put(outputOffset + 2, (byte) word);
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      int groupOffset = valueOffset + 8 * fullGroups;
+      int outputOffset = dataOffset + 3 * fullGroups;
       int word =
-          ((values[vBase] & 3) << 14)
-              | ((values[vBase + 1] & 3) << 12)
-              | ((values[vBase + 2] & 3) << 10)
-              | ((values[vBase + 3] & 3) << 8)
-              | ((values[vBase + 4] & 3) << 6)
-              | ((values[vBase + 5] & 3) << 4)
-              | ((values[vBase + 6] & 3) << 2)
-              | (values[vBase + 7] & 3);
-      out.put(oBase, (byte) (word >>> 8));
-      out.put(oBase + 1, (byte) word);
-    }
-    packRemainder(values, valPos, count, out, outPos, fullGroups, 2, 3);
+          packWord3(
+              values[groupOffset],
+              remaining > 1 ? values[groupOffset + 1] : 0,
+              remaining > 2 ? values[groupOffset + 2] : 0,
+              remaining > 3 ? values[groupOffset + 3] : 0,
+              remaining > 4 ? values[groupOffset + 4] : 0,
+              remaining > 5 ? values[groupOffset + 5] : 0,
+              remaining > 6 ? values[groupOffset + 6] : 0,
+              0);
+      int byteCount = byteWidth(3 * remaining);
+      for (int k = 0; k < byteCount; k += 1) {
+        data.put(outputOffset + k, (byte) (word >>> (16 - 8 * k)));
+      }
+    }
+
+    return byteWidth(3 * count);
   }
 
-  /** 3-bit values: 8 values → 3 bytes. */
-  private static void packBits3(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int vBase = valPos + (g << 3);
-      int oBase = outPos + g * 3;
+  private static int packWord3(int a, int b, int c, int d, int e, int f, int g, int h) {
+    return ((a & 0b111) << 21)
+        | ((b & 0b111) << 18)
+        | ((c & 0b111) << 15)
+        | ((d & 0b111) << 12)
+        | ((e & 0b111) << 9)
+        | ((f & 0b111) << 6)
+        | ((g & 0b111) << 3)
+        | (h & 0b111);
+  }
+
+  /** 4-bit values: 8 values into 4 bytes. */
+  private static int packBits4(
+      int[] values, int valueOffset, ByteBuffer data, int dataOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = valueOffset + 8 * group;
+      int outputOffset = dataOffset + 4 * group;
+      int word =
+          packWord4(
+              values[groupOffset],
+              values[groupOffset + 1],
+              values[groupOffset + 2],
+              values[groupOffset + 3],
+              values[groupOffset + 4],
+              values[groupOffset + 5],
+              values[groupOffset + 6],
+              values[groupOffset + 7]);
+      data.put(outputOffset, (byte) (word >>> 24));
+      data.put(outputOffset + 1, (byte) (word >>> 16));
+      data.put(outputOffset + 2, (byte) (word >>> 8));
+      data.put(outputOffset + 3, (byte) word);
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      int groupOffset = valueOffset + 8 * fullGroups;
+      int outputOffset = dataOffset + 4 * fullGroups;
       int word =
-          ((values[vBase] & 7) << 21)
-              | ((values[vBase + 1] & 7) << 18)
-              | ((values[vBase + 2] & 7) << 15)
-              | ((values[vBase + 3] & 7) << 12)
-              | ((values[vBase + 4] & 7) << 9)
-              | ((values[vBase + 5] & 7) << 6)
-              | ((values[vBase + 6] & 7) << 3)
-              | (values[vBase + 7] & 7);
-      out.put(oBase, (byte) (word >>> 16));
-      out.put(oBase + 1, (byte) (word >>> 8));
-      out.put(oBase + 2, (byte) word);
-    }
-    packRemainder(values, valPos, count, out, outPos, fullGroups, 3, 7);
+          packWord4(
+              values[groupOffset],
+              remaining > 1 ? values[groupOffset + 1] : 0,
+              remaining > 2 ? values[groupOffset + 2] : 0,
+              remaining > 3 ? values[groupOffset + 3] : 0,
+              remaining > 4 ? values[groupOffset + 4] : 0,
+              remaining > 5 ? values[groupOffset + 5] : 0,
+              remaining > 6 ? values[groupOffset + 6] : 0,
+              0);
+      int byteCount = byteWidth(4 * remaining);
+      for (int k = 0; k < byteCount; k += 1) {
+        data.put(outputOffset + k, (byte) (word >>> (24 - 8 * k)));
+      }
+    }
+
+    return byteWidth(4 * count);
   }
 
-  /** 4-bit values: 8 values → 4 bytes. */
-  private static void packBits4(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int vBase = valPos + (g << 3);
-      int oBase = outPos + (g << 2);
-      long word =
-          ((long) (values[vBase] & 15) << 28)
-              | ((long) (values[vBase + 1] & 15) << 24)
-              | ((long) (values[vBase + 2] & 15) << 20)
-              | ((long) (values[vBase + 3] & 15) << 16)
-              | ((long) (values[vBase + 4] & 15) << 12)
-              | ((long) (values[vBase + 5] & 15) << 8)
-              | ((long) (values[vBase + 6] & 15) << 4)
-              | (long) (values[vBase + 7] & 15);
-      out.put(oBase, (byte) (word >>> 24));
-      out.put(oBase + 1, (byte) (word >>> 16));
-      out.put(oBase + 2, (byte) (word >>> 8));
-      out.put(oBase + 3, (byte) word);
-    }
-    packRemainder(values, valPos, count, out, outPos, fullGroups, 4, 15);
+  private static int packWord4(int a, int b, int c, int d, int e, int f, int g, int h) {
+    return ((a & 0b1111) << 28)
+        | ((b & 0b1111) << 24)
+        | ((c & 0b1111) << 20)
+        | ((d & 0b1111) << 16)
+        | ((e & 0b1111) << 12)
+        | ((f & 0b1111) << 8)
+        | ((g & 0b1111) << 4)
+        | (h & 0b1111);
   }
 
-  /** 5-bit values: 8 values → 5 bytes. */
-  private static void packBits5(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int vBase = valPos + (g << 3);
-      int oBase = outPos + g * 5;
+  /** 5-bit values: 8 values into 5 bytes. */
+  private static int packBits5(
+      int[] values, int valueOffset, ByteBuffer data, int dataOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = valueOffset + 8 * group;
+      int outputOffset = dataOffset + 5 * group;
       long word =
-          ((long) (values[vBase] & 31) << 35)
-              | ((long) (values[vBase + 1] & 31) << 30)
-              | ((long) (values[vBase + 2] & 31) << 25)
-              | ((long) (values[vBase + 3] & 31) << 20)
-              | ((long) (values[vBase + 4] & 31) << 15)
-              | ((long) (values[vBase + 5] & 31) << 10)
-              | ((long) (values[vBase + 6] & 31) << 5)
-              | (long) (values[vBase + 7] & 31);
-      out.put(oBase, (byte) (word >>> 32));
-      out.put(oBase + 1, (byte) (word >>> 24));
-      out.put(oBase + 2, (byte) (word >>> 16));
-      out.put(oBase + 3, (byte) (word >>> 8));
-      out.put(oBase + 4, (byte) word);
-    }
-    packRemainder(values, valPos, count, out, outPos, fullGroups, 5, 31);
-  }
+          packWord5(
+              values[groupOffset],
+              values[groupOffset + 1],
+              values[groupOffset + 2],
+              values[groupOffset + 3],
+              values[groupOffset + 4],
+              values[groupOffset + 5],
+              values[groupOffset + 6],
+              values[groupOffset + 7]);
+      data.put(outputOffset, (byte) (word >>> 32));
+      data.put(outputOffset + 1, (byte) (word >>> 24));
+      data.put(outputOffset + 2, (byte) (word >>> 16));
+      data.put(outputOffset + 3, (byte) (word >>> 8));
+      data.put(outputOffset + 4, (byte) word);
+    }
 
-  /** 6-bit values: 8 values → 6 bytes. */
-  private static void packBits6(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int vBase = valPos + (g << 3);
-      int oBase = outPos + g * 6;
+    int remaining = count % 8;
+    if (remaining > 0) {
+      int groupOffset = valueOffset + 8 * fullGroups;
+      int outputOffset = dataOffset + 5 * fullGroups;
       long word =
-          ((long) (values[vBase] & 63) << 42)
-              | ((long) (values[vBase + 1] & 63) << 36)
-              | ((long) (values[vBase + 2] & 63) << 30)
-              | ((long) (values[vBase + 3] & 63) << 24)
-              | ((long) (values[vBase + 4] & 63) << 18)
-              | ((long) (values[vBase + 5] & 63) << 12)
-              | ((long) (values[vBase + 6] & 63) << 6)
-              | (long) (values[vBase + 7] & 63);
-      out.put(oBase, (byte) (word >>> 40));
-      out.put(oBase + 1, (byte) (word >>> 32));
-      out.put(oBase + 2, (byte) (word >>> 24));
-      out.put(oBase + 3, (byte) (word >>> 16));
-      out.put(oBase + 4, (byte) (word >>> 8));
-      out.put(oBase + 5, (byte) word);
-    }
-    packRemainder(values, valPos, count, out, outPos, fullGroups, 6, 63);
+          packWord5(
+              values[groupOffset],
+              remaining > 1 ? values[groupOffset + 1] : 0,
+              remaining > 2 ? values[groupOffset + 2] : 0,
+              remaining > 3 ? values[groupOffset + 3] : 0,
+              remaining > 4 ? values[groupOffset + 4] : 0,
+              remaining > 5 ? values[groupOffset + 5] : 0,
+              remaining > 6 ? values[groupOffset + 6] : 0,
+              0);
+      int byteCount = byteWidth(5 * remaining);
+      for (int k = 0; k < byteCount; k += 1) {
+        data.put(outputOffset + k, (byte) (word >>> (32 - 8 * k)));
+      }
+    }
+
+    return byteWidth(5 * count);
   }
 
-  /** 7-bit values: 8 values → 7 bytes. */
-  private static void packBits7(int[] values, int valPos, int count, ByteBuffer out, int outPos) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int vBase = valPos + (g << 3);
-      int oBase = outPos + g * 7;
-      long word =
-          ((long) (values[vBase] & 127) << 49)
-              | ((long) (values[vBase + 1] & 127) << 42)
-              | ((long) (values[vBase + 2] & 127) << 35)
-              | ((long) (values[vBase + 3] & 127) << 28)
-              | ((long) (values[vBase + 4] & 127) << 21)
-              | ((long) (values[vBase + 5] & 127) << 14)
-              | ((long) (values[vBase + 6] & 127) << 7)
-              | (long) (values[vBase + 7] & 127);
-      out.put(oBase, (byte) (word >>> 48));
-      out.put(oBase + 1, (byte) (word >>> 40));
-      out.put(oBase + 2, (byte) (word >>> 32));
-      out.put(oBase + 3, (byte) (word >>> 24));
-      out.put(oBase + 4, (byte) (word >>> 16));
-      out.put(oBase + 5, (byte) (word >>> 8));
-      out.put(oBase + 6, (byte) word);
-    }
-    packRemainder(values, valPos, count, out, outPos, fullGroups, 7, 127);
+  private static long packWord5(int a, int b, int c, int d, int e, int f, int g, int h) {
+    return ((long) (a & 0b11111) << 35)
+        | ((long) (b & 0b11111) << 30)
+        | ((long) (c & 0b11111) << 25)
+        | ((long) (d & 0b11111) << 20)
+        | ((long) (e & 0b11111) << 15)
+        | ((long) (f & 0b11111) << 10)
+        | ((long) (g & 0b11111) << 5)
+        | (long) (h & 0b11111);
   }
 
-  /**
-   * Packs the final partial group (rem = count & 7 values) for bit widths 2–7. Values are
-   * left-aligned in {@code ceil(rem * bitsPerValue / 8)} bytes.
-   */
-  private static void packRemainder(
-      int[] values,
-      int valPos,
-      int count,
-      ByteBuffer out,
-      int outPos,
-      int fullGroups,
-      int bitsPerValue,
-      int mask) {
-    int rem = count & 7;
-    if (rem > 0) {
-      int vBase = valPos + (fullGroups << 3);
-      int oBase = outPos + fullGroups * bitsPerValue;
-      long word = 0;
-      for (int k = 0; k < rem; k += 1) {
-        word = (word << bitsPerValue) | (values[vBase + k] & mask);
+  /** 6-bit values: 8 values into 6 bytes. */
+  private static int packBits6(
+      int[] values, int valueOffset, ByteBuffer data, int dataOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = valueOffset + 8 * group;
+      int outputOffset = dataOffset + 6 * group;
+      long word =
+          packWord6(
+              values[groupOffset],
+              values[groupOffset + 1],
+              values[groupOffset + 2],
+              values[groupOffset + 3],
+              values[groupOffset + 4],
+              values[groupOffset + 5],
+              values[groupOffset + 6],
+              values[groupOffset + 7]);
+      data.put(outputOffset, (byte) (word >>> 40));
+      data.put(outputOffset + 1, (byte) (word >>> 32));
+      data.put(outputOffset + 2, (byte) (word >>> 24));
+      data.put(outputOffset + 3, (byte) (word >>> 16));
+      data.put(outputOffset + 4, (byte) (word >>> 8));
+      data.put(outputOffset + 5, (byte) word);
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      int groupOffset = valueOffset + 8 * fullGroups;
+      int outputOffset = dataOffset + 6 * fullGroups;
+      long word =
+          packWord6(
+              values[groupOffset],
+              remaining > 1 ? values[groupOffset + 1] : 0,
+              remaining > 2 ? values[groupOffset + 2] : 0,
+              remaining > 3 ? values[groupOffset + 3] : 0,
+              remaining > 4 ? values[groupOffset + 4] : 0,
+              remaining > 5 ? values[groupOffset + 5] : 0,
+              remaining > 6 ? values[groupOffset + 6] : 0,
+              0);
+      int byteCount = byteWidth(6 * remaining);
+      for (int k = 0; k < byteCount; k += 1) {
+        data.put(outputOffset + k, (byte) (word >>> (40 - 8 * k)));
       }
-      int remBits = rem * bitsPerValue;
-      int remBytes = (remBits + 7) >>> 3;
-      word <<= (remBytes << 3) - remBits;
-      for (int k = remBytes - 1; k >= 0; k--) {
-        out.put(oBase + k, (byte) word);
-        word >>>= 8;
+    }
+
+    return byteWidth(6 * count);
+  }
+
+  private static long packWord6(int a, int b, int c, int d, int e, int f, int g, int h) {
+    return ((long) (a & 0b111111) << 42)
+        | ((long) (b & 0b111111) << 36)
+        | ((long) (c & 0b111111) << 30)
+        | ((long) (d & 0b111111) << 24)
+        | ((long) (e & 0b111111) << 18)
+        | ((long) (f & 0b111111) << 12)
+        | ((long) (g & 0b111111) << 6)
+        | (long) (h & 0b111111);
+  }
+
+  /** 7-bit values: 8 values into 7 bytes. */
+  private static int packBits7(
+      int[] values, int valueOffset, ByteBuffer data, int dataOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = valueOffset + 8 * group;
+      int outputOffset = dataOffset + 7 * group;
+      long word =
+          packWord7(
+              values[groupOffset],
+              values[groupOffset + 1],
+              values[groupOffset + 2],
+              values[groupOffset + 3],
+              values[groupOffset + 4],
+              values[groupOffset + 5],
+              values[groupOffset + 6],
+              values[groupOffset + 7]);
+      data.put(outputOffset, (byte) (word >>> 48));
+      data.put(outputOffset + 1, (byte) (word >>> 40));
+      data.put(outputOffset + 2, (byte) (word >>> 32));
+      data.put(outputOffset + 3, (byte) (word >>> 24));
+      data.put(outputOffset + 4, (byte) (word >>> 16));
+      data.put(outputOffset + 5, (byte) (word >>> 8));
+      data.put(outputOffset + 6, (byte) word);
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      int groupOffset = valueOffset + 8 * fullGroups;
+      int outputOffset = dataOffset + 7 * fullGroups;
+      long word =
+          packWord7(
+              values[groupOffset],
+              remaining > 1 ? values[groupOffset + 1] : 0,
+              remaining > 2 ? values[groupOffset + 2] : 0,
+              remaining > 3 ? values[groupOffset + 3] : 0,
+              remaining > 4 ? values[groupOffset + 4] : 0,
+              remaining > 5 ? values[groupOffset + 5] : 0,
+              remaining > 6 ? values[groupOffset + 6] : 0,
+              0);
+      int byteCount = byteWidth(7 * remaining);
+      for (int k = 0; k < byteCount; k += 1) {
+        data.put(outputOffset + k, (byte) (word >>> (48 - 8 * k)));
       }
     }
+
+    return byteWidth(7 * count);
+  }
+
+  private static long packWord7(int a, int b, int c, int d, int e, int f, int g, int h) {
+    return ((long) (a & 0b1111111) << 49)
+        | ((long) (b & 0b1111111) << 42)
+        | ((long) (c & 0b1111111) << 35)
+        | ((long) (d & 0b1111111) << 28)
+        | ((long) (e & 0b1111111) << 21)
+        | ((long) (f & 0b1111111) << 14)
+        | ((long) (g & 0b1111111) << 7)
+        | (long) (h & 0b1111111);
   }
 
   // ---------------------------------------------------------------------------
-  // Specialized unpack: b=1..7
+  // Specialized unpack: width=1..7
   // ---------------------------------------------------------------------------
 
-  /** 1-bit values: 1 byte → 8 values. */
-  private static void unpackBits1(ByteBuffer data, int dataPos, int[] output, int count) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int oBase = g << 3;
-      int w = data.get(dataPos + g) & 0xFF;
-      output[oBase] = (w >>> 7) & 1;
-      output[oBase + 1] = (w >>> 6) & 1;
-      output[oBase + 2] = (w >>> 5) & 1;
-      output[oBase + 3] = (w >>> 4) & 1;
-      output[oBase + 4] = (w >>> 3) & 1;
-      output[oBase + 5] = (w >>> 2) & 1;
-      output[oBase + 6] = (w >>> 1) & 1;
-      output[oBase + 7] = w & 1;
-    }
-    unpackRemainder(data, dataPos, output, count, fullGroups, 1, 1);
+  /** 1-bit values: 1 byte into 8 values. */
+  private static int unpackBits1(
+      ByteBuffer data, int dataOffset, int[] output, int outputOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = outputOffset + 8 * group;
+      int word = (int) readWord(data, dataOffset + group, 1);
+      output[groupOffset] = (word >>> 7) & 0b1;
+      output[groupOffset + 1] = (word >>> 6) & 0b1;
+      output[groupOffset + 2] = (word >>> 5) & 0b1;
+      output[groupOffset + 3] = (word >>> 4) & 0b1;
+      output[groupOffset + 4] = (word >>> 3) & 0b1;
+      output[groupOffset + 5] = (word >>> 2) & 0b1;
+      output[groupOffset + 6] = (word >>> 1) & 0b1;
+      output[groupOffset + 7] = word & 0b1;
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      long word = readWord(data, dataOffset + fullGroups, byteWidth(remaining));
+      unpackRemainder(1, word, output, outputOffset + 8 * fullGroups, remaining);
+    }
+
+    return byteWidth(count);
   }
 
-  /** 2-bit values: 2 bytes → 8 values. */
-  private static void unpackBits2(ByteBuffer data, int dataPos, int[] output, int count) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int oBase = g << 3;
-      int dBase = dataPos + (g << 1);
-      int w = ((data.get(dBase) & 0xFF) << 8) | (data.get(dBase + 1) & 0xFF);
-      output[oBase] = (w >>> 14) & 3;
-      output[oBase + 1] = (w >>> 12) & 3;
-      output[oBase + 2] = (w >>> 10) & 3;
-      output[oBase + 3] = (w >>> 8) & 3;
-      output[oBase + 4] = (w >>> 6) & 3;
-      output[oBase + 5] = (w >>> 4) & 3;
-      output[oBase + 6] = (w >>> 2) & 3;
-      output[oBase + 7] = w & 3;
-    }
-    unpackRemainder(data, dataPos, output, count, fullGroups, 2, 3);
+  /** 2-bit values: 2 bytes into 8 values. */
+  private static int unpackBits2(
+      ByteBuffer data, int dataOffset, int[] output, int outputOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = outputOffset + 8 * group;
+      int word = (int) readWord(data, dataOffset + 2 * group, 2);
+      output[groupOffset] = (word >>> 14) & 0b11;
+      output[groupOffset + 1] = (word >>> 12) & 0b11;
+      output[groupOffset + 2] = (word >>> 10) & 0b11;
+      output[groupOffset + 3] = (word >>> 8) & 0b11;
+      output[groupOffset + 4] = (word >>> 6) & 0b11;
+      output[groupOffset + 5] = (word >>> 4) & 0b11;
+      output[groupOffset + 6] = (word >>> 2) & 0b11;
+      output[groupOffset + 7] = word & 0b11;
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      long word = readWord(data, dataOffset + 2 * fullGroups, byteWidth(2 * remaining));
+      unpackRemainder(2, word, output, outputOffset + 8 * fullGroups, remaining);
+    }
+
+    return byteWidth(2 * count);
   }
 
-  /** 3-bit values: 3 bytes → 8 values. */
-  private static void unpackBits3(ByteBuffer data, int dataPos, int[] output, int count) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int oBase = g << 3;
-      int dBase = dataPos + g * 3;
-      int w =
-          ((data.get(dBase) & 0xFF) << 16)
-              | ((data.get(dBase + 1) & 0xFF) << 8)
-              | (data.get(dBase + 2) & 0xFF);
-      output[oBase] = (w >>> 21) & 7;
-      output[oBase + 1] = (w >>> 18) & 7;
-      output[oBase + 2] = (w >>> 15) & 7;
-      output[oBase + 3] = (w >>> 12) & 7;
-      output[oBase + 4] = (w >>> 9) & 7;
-      output[oBase + 5] = (w >>> 6) & 7;
-      output[oBase + 6] = (w >>> 3) & 7;
-      output[oBase + 7] = w & 7;
-    }
-    unpackRemainder(data, dataPos, output, count, fullGroups, 3, 7);
+  /** 3-bit values: 3 bytes into 8 values. */
+  private static int unpackBits3(
+      ByteBuffer data, int dataOffset, int[] output, int outputOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = outputOffset + 8 * group;
+      int word = (int) readWord(data, dataOffset + 3 * group, 3);
+      output[groupOffset] = (word >>> 21) & 0b111;
+      output[groupOffset + 1] = (word >>> 18) & 0b111;
+      output[groupOffset + 2] = (word >>> 15) & 0b111;
+      output[groupOffset + 3] = (word >>> 12) & 0b111;
+      output[groupOffset + 4] = (word >>> 9) & 0b111;
+      output[groupOffset + 5] = (word >>> 6) & 0b111;
+      output[groupOffset + 6] = (word >>> 3) & 0b111;
+      output[groupOffset + 7] = word & 0b111;
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      long word = readWord(data, dataOffset + 3 * fullGroups, byteWidth(3 * remaining));
+      unpackRemainder(3, word, output, outputOffset + 8 * fullGroups, remaining);
+    }
+
+    return byteWidth(3 * count);
   }
 
-  /** 4-bit values: 4 bytes → 8 values. */
-  private static void unpackBits4(ByteBuffer data, int dataPos, int[] output, int count) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int oBase = g << 3;
-      int dBase = dataPos + (g << 2);
-      long w =
-          ((long) (data.get(dBase) & 0xFF) << 24)
-              | ((long) (data.get(dBase + 1) & 0xFF) << 16)
-              | ((long) (data.get(dBase + 2) & 0xFF) << 8)
-              | (long) (data.get(dBase + 3) & 0xFF);
-      output[oBase] = (int) (w >>> 28) & 15;
-      output[oBase + 1] = (int) (w >>> 24) & 15;
-      output[oBase + 2] = (int) (w >>> 20) & 15;
-      output[oBase + 3] = (int) (w >>> 16) & 15;
-      output[oBase + 4] = (int) (w >>> 12) & 15;
-      output[oBase + 5] = (int) (w >>> 8) & 15;
-      output[oBase + 6] = (int) (w >>> 4) & 15;
-      output[oBase + 7] = (int) w & 15;
-    }
-    unpackRemainder(data, dataPos, output, count, fullGroups, 4, 15);
+  /** 4-bit values: 4 bytes into 8 values. */
+  private static int unpackBits4(
+      ByteBuffer data, int dataOffset, int[] output, int outputOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = outputOffset + 8 * group;
+      long word = readWord(data, dataOffset + 4 * group, 4);
+      output[groupOffset] = (int) (word >>> 28) & 0b1111;
+      output[groupOffset + 1] = (int) (word >>> 24) & 0b1111;
+      output[groupOffset + 2] = (int) (word >>> 20) & 0b1111;
+      output[groupOffset + 3] = (int) (word >>> 16) & 0b1111;
+      output[groupOffset + 4] = (int) (word >>> 12) & 0b1111;
+      output[groupOffset + 5] = (int) (word >>> 8) & 0b1111;
+      output[groupOffset + 6] = (int) (word >>> 4) & 0b1111;
+      output[groupOffset + 7] = (int) word & 0b1111;
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      long word = readWord(data, dataOffset + 4 * fullGroups, byteWidth(4 * remaining));
+      unpackRemainder(4, word, output, outputOffset + 8 * fullGroups, remaining);
+    }
+
+    return byteWidth(4 * count);
   }
 
-  /** 5-bit values: 5 bytes → 8 values. */
-  private static void unpackBits5(ByteBuffer data, int dataPos, int[] output, int count) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int oBase = g << 3;
-      int dBase = dataPos + g * 5;
-      long w =
-          ((long) (data.get(dBase) & 0xFF) << 32)
-              | ((long) (data.get(dBase + 1) & 0xFF) << 24)
-              | ((long) (data.get(dBase + 2) & 0xFF) << 16)
-              | ((long) (data.get(dBase + 3) & 0xFF) << 8)
-              | (long) (data.get(dBase + 4) & 0xFF);
-      output[oBase] = (int) (w >>> 35) & 31;
-      output[oBase + 1] = (int) (w >>> 30) & 31;
-      output[oBase + 2] = (int) (w >>> 25) & 31;
-      output[oBase + 3] = (int) (w >>> 20) & 31;
-      output[oBase + 4] = (int) (w >>> 15) & 31;
-      output[oBase + 5] = (int) (w >>> 10) & 31;
-      output[oBase + 6] = (int) (w >>> 5) & 31;
-      output[oBase + 7] = (int) w & 31;
-    }
-    unpackRemainder(data, dataPos, output, count, fullGroups, 5, 31);
+  /** 5-bit values: 5 bytes into 8 values. */
+  private static int unpackBits5(
+      ByteBuffer data, int dataOffset, int[] output, int outputOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = outputOffset + 8 * group;
+      long word = readWord(data, dataOffset + 5 * group, 5);
+      output[groupOffset] = (int) (word >>> 35) & 0b11111;
+      output[groupOffset + 1] = (int) (word >>> 30) & 0b11111;
+      output[groupOffset + 2] = (int) (word >>> 25) & 0b11111;
+      output[groupOffset + 3] = (int) (word >>> 20) & 0b11111;
+      output[groupOffset + 4] = (int) (word >>> 15) & 0b11111;
+      output[groupOffset + 5] = (int) (word >>> 10) & 0b11111;
+      output[groupOffset + 6] = (int) (word >>> 5) & 0b11111;
+      output[groupOffset + 7] = (int) word & 0b11111;
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      long word = readWord(data, dataOffset + 5 * fullGroups, byteWidth(5 * remaining));
+      unpackRemainder(5, word, output, outputOffset + 8 * fullGroups, remaining);
+    }
+
+    return byteWidth(5 * count);
   }
 
-  /** 6-bit values: 6 bytes → 8 values. */
-  private static void unpackBits6(ByteBuffer data, int dataPos, int[] output, int count) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int oBase = g << 3;
-      int dBase = dataPos + g * 6;
-      long w =
-          ((long) (data.get(dBase) & 0xFF) << 40)
-              | ((long) (data.get(dBase + 1) & 0xFF) << 32)
-              | ((long) (data.get(dBase + 2) & 0xFF) << 24)
-              | ((long) (data.get(dBase + 3) & 0xFF) << 16)
-              | ((long) (data.get(dBase + 4) & 0xFF) << 8)
-              | (long) (data.get(dBase + 5) & 0xFF);
-      output[oBase] = (int) (w >>> 42) & 63;
-      output[oBase + 1] = (int) (w >>> 36) & 63;
-      output[oBase + 2] = (int) (w >>> 30) & 63;
-      output[oBase + 3] = (int) (w >>> 24) & 63;
-      output[oBase + 4] = (int) (w >>> 18) & 63;
-      output[oBase + 5] = (int) (w >>> 12) & 63;
-      output[oBase + 6] = (int) (w >>> 6) & 63;
-      output[oBase + 7] = (int) w & 63;
-    }
-    unpackRemainder(data, dataPos, output, count, fullGroups, 6, 63);
+  /** 6-bit values: 6 bytes into 8 values. */
+  private static int unpackBits6(
+      ByteBuffer data, int dataOffset, int[] output, int outputOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = outputOffset + 8 * group;
+      long word = readWord(data, dataOffset + 6 * group, 6);
+      output[groupOffset] = (int) (word >>> 42) & 0b111111;
+      output[groupOffset + 1] = (int) (word >>> 36) & 0b111111;
+      output[groupOffset + 2] = (int) (word >>> 30) & 0b111111;
+      output[groupOffset + 3] = (int) (word >>> 24) & 0b111111;
+      output[groupOffset + 4] = (int) (word >>> 18) & 0b111111;
+      output[groupOffset + 5] = (int) (word >>> 12) & 0b111111;
+      output[groupOffset + 6] = (int) (word >>> 6) & 0b111111;
+      output[groupOffset + 7] = (int) word & 0b111111;
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      long word = readWord(data, dataOffset + 6 * fullGroups, byteWidth(6 * remaining));
+      unpackRemainder(6, word, output, outputOffset + 8 * fullGroups, remaining);
+    }
+
+    return byteWidth(6 * count);
   }
 
-  /** 7-bit values: 7 bytes → 8 values. */
-  private static void unpackBits7(ByteBuffer data, int dataPos, int[] output, int count) {
-    int fullGroups = count >>> 3;
-    for (int g = 0; g < fullGroups; g += 1) {
-      int oBase = g << 3;
-      int dBase = dataPos + g * 7;
-      long w =
-          ((long) (data.get(dBase) & 0xFF) << 48)
-              | ((long) (data.get(dBase + 1) & 0xFF) << 40)
-              | ((long) (data.get(dBase + 2) & 0xFF) << 32)
-              | ((long) (data.get(dBase + 3) & 0xFF) << 24)
-              | ((long) (data.get(dBase + 4) & 0xFF) << 16)
-              | ((long) (data.get(dBase + 5) & 0xFF) << 8)
-              | (long) (data.get(dBase + 6) & 0xFF);
-      output[oBase] = (int) (w >>> 49) & 127;
-      output[oBase + 1] = (int) (w >>> 42) & 127;
-      output[oBase + 2] = (int) (w >>> 35) & 127;
-      output[oBase + 3] = (int) (w >>> 28) & 127;
-      output[oBase + 4] = (int) (w >>> 21) & 127;
-      output[oBase + 5] = (int) (w >>> 14) & 127;
-      output[oBase + 6] = (int) (w >>> 7) & 127;
-      output[oBase + 7] = (int) w & 127;
-    }
-    unpackRemainder(data, dataPos, output, count, fullGroups, 7, 127);
+  /** 7-bit values: 7 bytes into 8 values. */
+  private static int unpackBits7(
+      ByteBuffer data, int dataOffset, int[] output, int outputOffset, int count) {
+    int fullGroups = count / 8;
+    for (int group = 0; group < fullGroups; group += 1) {
+      int groupOffset = outputOffset + 8 * group;
+      long word = readWord(data, dataOffset + 7 * group, 7);
+      output[groupOffset] = (int) (word >>> 49) & 0b1111111;
+      output[groupOffset + 1] = (int) (word >>> 42) & 0b1111111;
+      output[groupOffset + 2] = (int) (word >>> 35) & 0b1111111;
+      output[groupOffset + 3] = (int) (word >>> 28) & 0b1111111;
+      output[groupOffset + 4] = (int) (word >>> 21) & 0b1111111;
+      output[groupOffset + 5] = (int) (word >>> 14) & 0b1111111;
+      output[groupOffset + 6] = (int) (word >>> 7) & 0b1111111;
+      output[groupOffset + 7] = (int) word & 0b1111111;
+    }
+
+    int remaining = count % 8;
+    if (remaining > 0) {
+      long word = readWord(data, dataOffset + 7 * fullGroups, byteWidth(7 * remaining));
+      unpackRemainder(7, word, output, outputOffset + 8 * fullGroups, remaining);
+    }
+
+    return byteWidth(7 * count);
   }
 
   /**
-   * Unpacks the final partial group (rem = count & 7 values) for bit widths 1–7. Reads {@code
-   * ceil(rem * bitsPerValue / 8)} bytes and right-aligns before extracting.
+   * Unpack {@code count < 8} values of {@code width} bits from {@code word}.
+   *
+   * @param width number of bits stored for each value
+   * @param word a long containing the bytes of the remaining values
+   * @param output array for unpacked output values
+   * @param outputOffset starting index to store values in the output array
+   * @param count number of values to unpack from the word
    */
   private static void unpackRemainder(
-      ByteBuffer data,
-      int dataPos,
-      int[] output,
-      int count,
-      int fullGroups,
-      int bitsPerValue,
-      int mask) {
-    int rem = count & 7;
-    if (rem > 0) {
-      int oBase = fullGroups << 3;
-      int dBase = dataPos + fullGroups * bitsPerValue;
-      int remBits = rem * bitsPerValue;
-      int remBytes = (remBits + 7) >>> 3;
-      long word = 0;
-      for (int k = 0; k < remBytes; k += 1) {
-        word = (word << 8) | (data.get(dBase + k) & 0xFF);
-      }
-      word >>>= (remBytes << 3) - remBits;
-      for (int k = rem - 1; k >= 0; k--) {
-        output[oBase + k] = (int) (word & mask);
-        word >>>= bitsPerValue;
-      }
+      int width, long word, int[] output, int outputOffset, int count) {
+    int mask = (1 << width) - 1;
+    // value bits are stored in the last bytes of the word
+    int valueBytes = byteWidth(count * width);
+    // the first value is width bits starting with the most-significant bits of the value bytes
+    int shift = 8 * valueBytes - width;
+    for (int i = 0; i < count; i += 1) {
+      output[outputOffset + i] = (int) ((word >>> shift) & mask);
+      shift -= width;
     }
   }
+
+  /**
+   * Read {@code count} bytes of data into the last {@code count} bytes of {@code word}.
+   *
+   * <p>The first input byte occupies the most significant bits of the last {@code count} bytes and
+   * the last input byte occupies the least significant bits of the word. The remaining most
+   * significant bytes of the word are 0.
+   */
+  private static long readWord(ByteBuffer data, int offset, int count) {
+    long word = 0;
+    for (int k = 0; k < count; k += 1) {
+      word = (word << 8) | (data.get(offset + k) & 0xFF);
+    }
+
+    return word;
+  }
+
+  /** Returns the number of whole bytes needed to hold {@code bits} bits. */
+  private static int byteWidth(int bits) {
+    return (bits + 7) / 8;
+  }
 }
diff --git a/core/src/main/java/org/apache/iceberg/mumbling/MumblingBitmap.java b/core/src/main/java/org/apache/iceberg/mumbling/MumblingBitmap.java
index 8758068e5624..74c5400acecd 100644
--- a/core/src/main/java/org/apache/iceberg/mumbling/MumblingBitmap.java
+++ b/core/src/main/java/org/apache/iceberg/mumbling/MumblingBitmap.java
@@ -19,6 +19,7 @@
 package org.apache.iceberg.mumbling;
 
 import java.nio.ByteBuffer;
+import org.apache.curator.shaded.com.google.common.base.Preconditions;
 
 /**
  * Read-only view of a Mumbling compressed bitmap stored in a {@link ByteBuffer}.
@@ -39,17 +40,32 @@
 class MumblingBitmap {
   private static final int VERSION = 1;
   private static final int HEADER_SIZE = 6;
-  private static final int DENSE_CONTAINER_BYTES = 32;
+  private static final int DENSE_CONTAINER_BIT = 0b0010_0000;
 
   private final ByteBuffer data;
-  private int[] offsets;
+  private final int cardinality;
+  private final int containerCount;
+  private int[] descriptors = null;
+  private int[] offsets = null;
 
   MumblingBitmap(ByteBuffer data) {
     int version = data.get(data.position()) & 0xFF;
     if (version != VERSION) {
-      throw new IllegalArgumentException("Unsupported Mumbling bitmap version: " + version);
+      throw new UnsupportedOperationException("Unsupported Mumbling bitmap version: " + version);
     }
+
     this.data = data;
+    this.cardinality =
+        (data.get(data.position() + 1) & 0xFF)
+            | ((data.get(data.position() + 2) & 0xFF) << 8)
+            | ((data.get(data.position() + 3) & 0xFF) << 16);
+    this.containerCount =
+        (data.get(data.position() + 4) & 0xFF) | ((data.get(data.position() + 5) & 0xFF) << 8);
+  }
+
+  /** Returns the number of bits set in the bitmap. */
+  public int cardinality() {
+    return cardinality;
   }
 
   /**
@@ -57,64 +73,97 @@ class MumblingBitmap {
    *
    * <p>Positions beyond the range of any container are always unset.
    */
-  boolean isSet(int pos) {
+  public boolean isSet(int pos) {
+    Preconditions.checkArgument(pos >= 0, "Invalid bit position: %s < 0", pos);
     int containerIndex = pos >>> 8;
     int posInContainer = pos & 0xFF;
 
-    int[] offs = ensureOffsets();
-    if (containerIndex >= offs.length - 1) {
+    if (containerIndex >= containerCount) {
       return false;
     }
 
-    int containerStart = offs[containerIndex];
-    int containerLength = offs[containerIndex + 1] - containerStart;
+    int containerStart = offset(containerIndex);
+    int descriptor = descriptor(containerIndex);
 
-    if (containerLength == DENSE_CONTAINER_BYTES) {
+    if (isDense(descriptor)) {
       // Dense: 32-byte bitset, MSB of byte 0 is position 0
-      int byteIdx = posInContainer >>> 3;
-      int bitIdx = 7 - (posInContainer & 7);
-      return ((data.get(containerStart + byteIdx) >>> bitIdx) & 1) == 1;
+      int byteIndex = posInContainer >>> 3;
+      int bitShift = 7 - (posInContainer & 0b111);
+      return ((data.get(containerStart + byteIndex) >>> bitShift) & 0b1) == 0b1;
+
     } else {
       // Sparse: sorted list of set positions; scan until found or exceeded
-      for (int i = 0; i < containerLength; i += 1) {
+      for (int i = 0; i < descriptor; i += 1) {
         int stored = data.get(containerStart + i) & 0xFF;
         if (stored == posInContainer) {
           return true;
         }
+
         if (stored > posInContainer) {
           return false;
         }
       }
+
       return false;
     }
   }
 
-  private int[] ensureOffsets() {
-    if (offsets == null) {
-      offsets = buildOffsets();
+  private int descriptor(int containerIndex) {
+    if (null == descriptors) {
+      decodeDescriptors();
     }
-    return offsets;
+
+    return descriptors[containerIndex];
+  }
+
+  private int offset(int containerIndex) {
+    if (null == offsets) {
+      decodeDescriptors();
+    }
+
+    return offsets[containerIndex];
   }
 
-  private int[] buildOffsets() {
-    int base = data.position();
+  /**
+   * Decode the descriptor array and produce an array of absolute container offsets in the buffer.
+   */
+  private void decodeDescriptors() {
+    this.descriptors = new int[containerCount];
+    int bytesRead =
+        PFOREncoding.decode(data, data.position() + HEADER_SIZE, descriptors, 0, containerCount);
 
-    // Container count: bytes 4–5, little-endian
-    int containerCount = (data.get(base + 4) & 0xFF) | ((data.get(base + 5) & 0xFF) << 8);
+    this.offsets = new int[containerCount + 1];
+    int firstContainerOffset = data.position() + HEADER_SIZE + bytesRead;
+    descriptorsToOffsets(firstContainerOffset, descriptors, offsets);
+  }
 
-    // Decode the PFOR descriptor array directly into a relative offset array, tracking bytes
-    // consumed so we know where the containers section starts
-    int[] offsets = new int[containerCount + 1];
-    ByteBuffer descriptorBuffer = data.duplicate();
-    descriptorBuffer.position(base + HEADER_SIZE);
-    int descriptorBytes = PFOREncoding.decodeOffsets(descriptorBuffer, containerCount, offsets);
+  private static boolean isDense(int descriptor) {
+    return (descriptor & DENSE_CONTAINER_BIT) == DENSE_CONTAINER_BIT;
+  }
 
-    // Adjust relative offsets to absolute positions in the buffer
-    int containersStart = base + HEADER_SIZE + descriptorBytes;
-    for (int i = 0; i <= containerCount; i += 1) {
-      offsets[i] += containersStart;
-    }
+  /**
+   * Convert container descriptors into an array of offsets starting at {@code baseOffset}.
+   *
+   * <p>For example, sparse descriptors [1, 1, 2] with base offset 0 produce [0, 1, 2, 4].
+   *
+   * @param baseOffset initial offset of the first container
+   * @param descriptors descriptor bytes; dense ones add 32 bytes, others add their value
+   * @param offsets output array of offsets; must be longer than {@code descriptors}
+   */
+  private static void descriptorsToOffsets(int baseOffset, int[] descriptors, int[] offsets) {
+    Preconditions.checkArgument(
+        offsets.length > descriptors.length,
+        "Cannot decode %s lengths into %s offsets (not enough space)",
+        descriptors.length,
+        offsets.length);
 
-    return offsets;
+    offsets[0] = baseOffset;
+    for (int i = 0; i < descriptors.length; i += 1) {
+      if (isDense(descriptors[i])) {
+        offsets[i + 1] = offsets[i] + 32;
+      } else {
+        offsets[i + 1] = offsets[i] + descriptors[i];
+      }
+    }
   }
 }
diff --git a/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java b/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java
index d40e05bd8d7c..fad8915811a1 100644
--- a/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java
+++ b/core/src/main/java/org/apache/iceberg/mumbling/PFOREncoding.java
@@ -19,303 +19,339 @@
 package org.apache.iceberg.mumbling;
 
 import java.nio.ByteBuffer;
+import org.apache.curator.shaded.com.google.common.base.Preconditions;
 import org.apache.iceberg.util.Pair;
 
 /**
- * PFOR (Patched Frame of Reference) encoding for arrays of unsigned byte values.
+ * Patched Frame of Reference (PFOR) encoding for arrays of unsigned byte values.
  *
  * <p>Implements the encoding described in Appendix A of the Mumbling bitmap specification. The
  * input array is split into 256-value chunks (the last chunk may be shorter). Each chunk is
- * independently compressed.
+ * independently encoded using 4 configuration values:
+ *
+ * <ul>
+ *   <li>{@code b1}: number of bits stored in the primary array for every normalized value
+ *   <li>{@code b2}: number of bits stored per exception value (normalized value of &gt; b1 bits)
+ *   <li>{@code e}: number of exceptions with more than b1 bits
+ *   <li>{@code m}: chunk-local minimum value, subtracted from all values to normalize
+ * </ul>
  *
  * <p>Each chunk is stored as:
  *
  * <ul>
- *   <li>3-byte header: {@code b1|b2} (low/high nibbles of byte 0), {@code e} (byte 1), {@code m}
- *       (byte 2)
- *   <li>Primary array: {@code ceil(n * b1 / 8)} bytes — the low {@code b1} bits of every value,
- *       packed MSB-first
- *   <li>Exception offsets: {@code e} bytes — the chunk-relative position of each exception value
- *   <li>Exception values: {@code ceil(e * b2 / 8)} bytes — bits {@code [b1, b1+b2)} of each
- *       exception, packed MSB-first
+ *   <li>3-byte header: {@code b1|b2} primary and exception bit widths (byte 0), {@code e} exception
+ *       count (byte 1), {@code m} normalization base value (byte 2)
+ *   <li>Primary array: the low {@code b1} bits of every normalized value, packed MSB-first ({@code
+ *       b1 * n} bits, padded to a byte)
+ *   <li>Exception offsets: chunk-relative positions of exception values ({@code e} bytes)
+ *   <li>Exception values: the high {@code b2} bits of every exception value, packed MSB-first
+ *       ({@code e * b2} bits, padded to a byte)
  * </ul>
- *
- * <p>During encoding, the chunk minimum {@code m} is subtracted from every value. During decoding,
- * {@code m} is added back. When {@code b1 = 8}, no exceptions are produced and {@code m} is stored
- * as 0 (original values are written directly). Bit packing and unpacking is delegated to {@link
- * BitPacking}.
  */
 class PFOREncoding {
   private static final int CHUNK_SIZE = 256;
-  private static final int DENSE_DESCRIPTOR_BIT = 0x20;
-  private static final int SPARSE_LENGTH_MASK = 0x1F;
-  private static final int DENSE_CONTAINER_BYTES = 32;
 
   private PFOREncoding() {}
 
   /**
-   * Encodes an array of unsigned byte values (each in {@code [0, 255]}) using PFOR encoding.
+   * Encodes {@code count} values from an array of unsigned byte values.
    *
    * @param values unsigned byte values to encode
-   * @return a newly allocated buffer
+   * @param count number of values to encode
+   * @return a {@link ByteBuffer} of the encoded values with position and limit set for reading
    */
-  static ByteBuffer encode(int[] values) {
-    return encode(values, values.length, null);
+  static ByteBuffer encode(int[] values, int count) {
+    ByteBuffer out = ByteBuffer.allocate(estimateEncodedSize(count));
+    int bytesWritten = encode(values, 0, out, out.position(), count);
+    return out.slice(0, bytesWritten);
   }
 
   /**
-   * Encodes an array of unsigned byte values (each in {@code [0, 255]}) using PFOR encoding.
+   * Encode {@code count} unsigned byte values from {@code values} into a buffer.
    *
-   * <p>If {@code buffer} has sufficient capacity its backing storage is reused; otherwise a new
-   * buffer is allocated. {@code buffer}'s position and limit are never modified. The returned
-   * buffer is always a slice with position=0 and limit=encoded length.
+   * <p>The buffer's position and limit are not modified.
    *
    * @param values unsigned byte values to encode
-   * @param length number of values to encode
-   * @return a slice of {@code buffer} if capacity was sufficient, otherwise a slice of a newly
-   *     allocated buffer
+   * @param valueOffset starting offset of values to encode
+   * @param out buffer to write encoded values to
+   * @param outOffset starting offset in the output buffer
+   * @param count number of values to encode
+   * @return the number of bytes written to the buffer
    */
-  static ByteBuffer encode(int[] values, int length) {
-    return encode(values, length, null);
-  }
-
-  /**
-   * Encodes the first {@code length} unsigned byte values (each in {@code [0, 255]}) from {@code
-   * values} using PFOR encoding.
-   *
-   * <p>If {@code buffer} has sufficient capacity its backing storage is reused; otherwise a new
-   * buffer is allocated. {@code buffer}'s position and limit are never modified. The returned
-   * buffer is always a slice with position=0 and limit=encoded length.
-   *
-   * @param values unsigned byte values to encode
-   * @param length number of values to encode
-   * @param buffer candidate buffer whose storage may be reused
-   * @return a slice of {@code buffer} if capacity was sufficient, otherwise a slice of a newly
-   *     allocated buffer
-   */
-  static ByteBuffer encode(int[] values, int length, ByteBuffer buffer) {
-    int numChunks = ceilDiv(length, CHUNK_SIZE);
-    // Worst-case per chunk is b1=8: 3-byte header + 1 byte per value. Any other b1 chosen by the
-    // encoder costs <= n bytes of data (otherwise b1=8 would have been selected instead).
-    int maxSize = 3 * numChunks + length;
-    ByteBuffer out =
-        buffer != null && buffer.capacity() >= maxSize ? buffer : ByteBuffer.allocate(maxSize);
-    int pos = 0;
-    int offset = 0;
-    while (offset < length) {
-      int chunkLength = Math.min(CHUNK_SIZE, length - offset);
-      pos += encodeChunk(values, offset, chunkLength, out, pos);
-      offset += chunkLength;
+  static int encode(int[] values, int valueOffset, ByteBuffer out, int outOffset, int count) {
+    // check the buffer's position and limit are compatible with outOffset and count
+    Preconditions.checkArgument(
+        outOffset >= out.position(),
+        "Cannot encode starting at %s to buffer with position %s",
+        outOffset,
+        out.position());
+    Preconditions.checkArgument(
+        estimateEncodedSize(count) <= out.limit() - outOffset,
+        "Cannot encode %s values to buffer with %s remaining space",
+        count,
+        out.limit() - outOffset);
+
+    int bytesWritten = 0;
+    int currentOffset = valueOffset;
+    int endOffset = valueOffset + count; // count is relative to valueOffset, not an end index
+    while (currentOffset < endOffset) {
+      int chunkLength = Math.min(CHUNK_SIZE, endOffset - currentOffset);
+      bytesWritten +=
+          encodeChunk(values, currentOffset, out, outOffset + bytesWritten, chunkLength);
+      currentOffset += chunkLength;
     }
 
-    return out.slice(0, pos);
+    return bytesWritten;
   }
 
   /**
-   * Decodes PFOR-encoded bytes back to unsigned byte values. Reads from {@code encoded.position()}
-   * using absolute indexing; the buffer's position is not modified.
+   * Decode to produce unsigned byte values.
+   *
+   * <p>Decodes starting at {@code encoded.position()} and does not modify the input buffer.
    *
    * @param encoded PFOR-encoded ByteBuffer produced by {@link #encode}
    * @param count total number of values to decode
    * @return decoded unsigned byte values
    */
   static int[] decode(ByteBuffer encoded, int count) {
-    if (count == 0) {
-      return new int[0];
-    }
-
-    int[] output = new int[count];
-    int pos = encoded.position();
-    int start = 0;
-    while (start < count) {
-      int length = Math.min(CHUNK_SIZE, count - start);
-      pos += decodeChunk(encoded, pos, output, start, length);
-      start += length;
-    }
-
-    return output;
+    int[] out = new int[count];
+    decode(encoded, encoded.position(), out, 0, count);
+    return out;
   }
 
   /**
-   * Encodes a container offset array as PFOR-encoded container lengths.
+   * Decode {@code count} unsigned bytes from a buffer into {@code out}.
    *
-   * <p>{@code offsets} must have {@code count + 1} entries. The length of container {@code i} is
-   * {@code offsets[i + 1] - offsets[i]} and is used as the descriptor value to encode.
+   * <p>This does not modify the input buffer.
    *
-   * @param offsets container offset array with {@code count + 1} entries
-   * @param count number of containers
-   * @return PFOR-encoded descriptor bytes
+   * @param encoded a buffer containing encoded data
+   * @param offset starting offset of encoded values
+   * @param out an output value array
+   * @param outOffset starting offset in the output array
+   * @param count number of values to decode
+   * @return the number of bytes read from the encoded buffer
    */
-  static ByteBuffer encodeOffsets(int[] offsets, int count) {
-    int[] lengths = new int[count];
-    for (int i = 0; i < count; i += 1) {
-      lengths[i] = offsets[i + 1] - offsets[i];
+  static int decode(ByteBuffer encoded, int offset, int[] out, int outOffset, int count) {
+    Preconditions.checkArgument(
+        offset >= encoded.position(),
+        "Cannot decode starting at %s from buffer with position %s",
+        offset,
+        encoded.position());
+
+    int bytesRead = 0;
+    int valuesRead = 0;
+
+    while (valuesRead < count) {
+      int chunkSize = Math.min(CHUNK_SIZE, count - valuesRead);
+      bytesRead += decodeChunk(encoded, offset + bytesRead, out, outOffset + valuesRead, chunkSize);
+      valuesRead += chunkSize;
     }
 
-    return encode(lengths, count);
+    return bytesRead;
   }
 
   /**
-   * Decodes PFOR-encoded descriptor bytes directly into a container offset array.
-   *
-   * <p>Reads from {@code encoded.position()} without modifying the buffer's position. Each decoded
-   * descriptor value is treated as a container length and accumulated into {@code offsets} as a
-   * prefix sum starting at 0. The caller is responsible for adjusting the resulting relative
-   * offsets to absolute positions.
+   * Encode one chunk into {@code out} starting at absolute position {@code outOffset}.
    *
-   * <p>{@code offsets} must have {@code count + 1} entries. On return, {@code offsets[i]} is the
-   * cumulative byte length of containers {@code 0..i-1} and {@code offsets[count]} is the total
-   * byte length of all containers.
-   *
-   * @param encoded PFOR-encoded descriptor bytes
-   * @param count number of containers (descriptors) to decode
-   * @param offsets array to fill; must have length &gt;= count + 1
-   * @return number of bytes consumed from {@code encoded}
+   * @param values array containing source values to encode
+   * @param valueOffset starting index of values to encode
+   * @param out an output {@link ByteBuffer}
+   * @param outOffset starting index for output in the out buffer
+   * @param count number of values to encode
+   * @return the number of bytes written to the output buffer
    */
-  static int decodeOffsets(ByteBuffer encoded, int count, int[] offsets) {
-    if (count == 0) {
-      offsets[0] = 0;
-      return 0;
-    }
-
-    int[] chunk = new int[Math.min(CHUNK_SIZE, count)];
-    int pos = encoded.position();
-    int containerIdx = 0;
-    int cumulative = 0;
-    while (containerIdx < count) {
-      int chunkLen = Math.min(CHUNK_SIZE, count - containerIdx);
-      pos += decodeChunk(encoded, pos, chunk, 0, chunkLen);
-      for (int i = 0; i < chunkLen; i += 1) {
-        offsets[containerIdx + i] = cumulative;
-        int descriptor = chunk[i];
-        cumulative += (descriptor & DENSE_DESCRIPTOR_BIT) != 0 ? DENSE_CONTAINER_BYTES : (descriptor & SPARSE_LENGTH_MASK);
-      }
-
-      containerIdx += chunkLen;
-    }
-    offsets[count] = cumulative;
-
-    return pos - encoded.position();
-  }
-
-  /**
-   * Encodes one chunk into {@code out} starting at absolute position {@code outPos}. Returns the
-   * number of bytes written.
-   */
-  private static int encodeChunk(int[] values, int start, int length, ByteBuffer out, int outPos) {
-    // Step 1: find base=min(values) for normalization
-    int base = min(values, start, length);
-
-    // Step 2: normalize by subtracting base
-    int[] normalized = new int[length];
+  private static int encodeChunk(
+      int[] values, int valueOffset, ByteBuffer out, int outOffset, int count) {
+    Preconditions.checkArgument(count >= 0, "Invalid value count to encode: %s", count);
+    Preconditions.checkArgument(
+        valueOffset + count <= values.length,
+        "Cannot encode %s values starting at %s from int[%s]: not enough values",
+        count,
+        valueOffset,
+        values.length);
+
+    // find base=min(values) for normalization
+    int base = min(values, valueOffset, count);
+
+    // normalize by subtracting base
+    int[] normalized = new int[count];
     int setBits = 0;
-    for (int i = 0; i < length; i += 1) {
-      normalized[i] = values[start + i] - base;
-      setBits |= normalized[i];
+    int normalizedSetBits = 0;
+    for (int i = 0; i < count; i += 1) {
+      setBits |= values[valueOffset + i];
+      normalized[i] = values[valueOffset + i] - base;
+      normalizedSetBits |= normalized[i];
     }
 
-    // Step 3: find the maximum bit width needed for normalized values
-    int maxWidth = width(setBits);
+    Preconditions.checkArgument(
+        width(setBits) <= 8,
+        "Cannot encode values wider than 8 bits: %s bits needed",
+        width(setBits));
 
-    // Step 4: choose b1 to minimize total encoded data size (excluding 3-byte header)
-    Pair<Integer, Integer> widthAndExcCount = chooseBitWidth(normalized, length, maxWidth);
+    // Choose b1 to minimize total encoded data size (excluding 3-byte header)
+    int maxWidth = width(normalizedSetBits);
+    Pair<Integer, Integer> widthAndExcCount = chooseBitWidth(normalized, count, maxWidth);
     int b1 = widthAndExcCount.first();
     int b2 = maxWidth - b1;
     int excCount = widthAndExcCount.second();
-    int primaryBytes = ceilDiv(length * b1, 8);
-    int excValueBytes = ceilDiv(excCount * b2, 8);
 
-    // Special case: b1=8 means store original values as raw bytes with a constant header.
-    // b2, e, and m should be 0, so the header is always 0x08 0x00 0x00.
+    // check space against the limit: outOffset is an absolute index, not relative to position
+    int requiredSize = encodedSize(count, b1, b2, excCount);
+    Preconditions.checkArgument(
+        outOffset + requiredSize <= out.limit(),
+        "Cannot encode %s bytes into buffer with %s remaining bytes",
+        requiredSize,
+        out.limit() - outOffset);
+
+    // Special case: b1=8 means store original values as raw bytes with b2, e, and m set to 0.
     if (b1 == 8) {
-      out.put(outPos, (byte) 0x08);
-      out.put(outPos + 1, (byte) 0);
-      out.put(outPos + 2, (byte) 0);
-      return copyBytes(values, start, length, out, outPos + 3) - outPos;
+      writeHeader(out, outOffset, b1, 0 /* b2 */, 0 /* excCount */, 0 /* m */);
+      return 3 + BitPacking.packBits(8, values, valueOffset, out, outOffset + 3, count);
     }
 
-    // Header: b1 in low nibble, b2 in high nibble, then e, then m
-
-    out.put(outPos, (byte) ((b2 << 4) | b1));
-    out.put(outPos + 1, (byte) excCount);
-    out.put(outPos + 2, (byte) base);
-    int pos = outPos + 3;
+    int bytesWritten = writeHeader(out, outOffset, b1, b2, excCount, base);
 
-    // Primary array: low b1 bits of every value, packed MSB-first
-    if (b1 > 0) {
-      BitPacking.packBits(normalized, 0, length, out, pos, b1);
-      pos += primaryBytes;
-    }
+    // Primary array: low b1 bits of every value
+    bytesWritten += BitPacking.packBits(b1, normalized, 0, out, outOffset + bytesWritten, count);
 
-    // b2 is the bit width of exception values: bits [b1, b1+b2) of each exception
-    if (maxWidth > b1) {
-      int[] exceptionOffsets = new int[length];
-      int[] exceptionValues = new int[length];
+    // b2 is the bit width of exception values: (maxWidth - b1) bits of each exception
+    if (excCount > 0) {
+      int[] excOffsets = new int[excCount];
+      int[] excValues = new int[excCount];
 
-      // Step 5: collect exceptions (values that do not fit in b1 bits)
+      // Collect exceptions (values that do not fit in b1 bits)
       int excIndex = 0;
       int threshold = 1 << b1;
-      for (int i = 0; i < length; i += 1) {
+      for (int i = 0; i < count; i += 1) {
         if (normalized[i] >= threshold) {
-          exceptionOffsets[excIndex] = i;
-          exceptionValues[excIndex] = normalized[i] >> b1;
+          excOffsets[excIndex] = i;
+          excValues[excIndex] = normalized[i] >>> b1;
           excIndex += 1;
         }
       }
 
       // Exception offsets (one byte per exception)
-      pos = copyBytes(exceptionOffsets, 0, excCount, out, pos);
-
-      // Exception values: bits [b1, b1+b2) of each exception, packed MSB-first
-      if (b2 > 0 && excCount > 0) {
-        if (b2 == 8) {
-          copyBytes(exceptionValues, 0, excCount, out, pos);
-        } else {
-          BitPacking.packBits(exceptionValues, 0, excCount, out, pos, b2);
-        }
-        pos += excValueBytes;
+      bytesWritten +=
+          BitPacking.packBits(8, excOffsets, 0, out, outOffset + bytesWritten, excCount);
+
+      // Exception values: remaining high b2 bits of each exception
+      bytesWritten +=
+          BitPacking.packBits(b2, excValues, 0, out, outOffset + bytesWritten, excCount);
+    }
+
+    return bytesWritten;
+  }
+
+  /**
+   * Decode one chunk of encoded data, writing decoded values into an output array.
+   *
+   * @param data buffer containing source data to decode
+   * @param dataOffset starting index in the buffer to decode
+   * @param out an output array that receives the decoded values
+   * @param outOffset starting index for output in the out array
+   * @param count number of values to decode
+   * @return the number of bytes read from {@code data}
+   */
+  private static int decodeChunk(
+      ByteBuffer data, int dataOffset, int[] out, int outOffset, int count) {
+    Preconditions.checkArgument(count >= 0, "Invalid value count to decode: %s", count);
+    Preconditions.checkArgument(
+        outOffset + count <= out.length,
+        "Cannot decode %s values starting at %s into int[%s]: not enough space",
+        count,
+        outOffset,
+        out.length);
+
+    int b1 = data.get(dataOffset) & 0x0F;
+    int b2 = (data.get(dataOffset) >>> 4) & 0x0F;
+    int excCount = data.get(dataOffset + 1) & 0xFF;
+    int base = data.get(dataOffset + 2) & 0xFF;
+    int bytesRead = 3;
+
+    // after reading the header, check that the full chunk is present
+    int expectedSize = encodedSize(count, b1, b2, excCount);
+    Preconditions.checkArgument(
+        dataOffset + expectedSize <= data.limit(),
+        "Cannot decode %s values from buffer with %s remaining bytes",
+        expectedSize,
+        data.limit() - dataOffset);
+
+    // Read primary array: low b1 bits of each value
+    bytesRead += BitPacking.unpackBits(b1, data, dataOffset + bytesRead, out, outOffset, count);
+
+    // Read exceptions and update output values
+    if (excCount > 0) {
+      int[] excOffsets = new int[excCount];
+      int[] excValues = new int[excCount];
+      int excListOffset = dataOffset + bytesRead;
+      int excDataOffset = dataOffset + bytesRead + excCount;
+
+      // Read exception indexes
+      bytesRead += BitPacking.unpackBits(8, data, excListOffset, excOffsets, 0, excCount);
+
+      // Read exception values and patch the primary values
+      bytesRead += BitPacking.unpackBits(b2, data, excDataOffset, excValues, 0, excCount);
+
+      // Update output values
+      for (int i = 0; i < excCount; i += 1) {
+        out[outOffset + excOffsets[i]] |= excValues[i] << b1;
       }
     }
 
-    return pos - outPos;
+    // Add back the chunk minimum
+    for (int i = 0; i < count; i += 1) {
+      out[outOffset + i] += base;
+    }
+
+    return bytesRead;
+  }
+
+  private static int writeHeader(
+      ByteBuffer out, int outOffset, int b1, int b2, int excCount, int base) {
+    // Header: b1 in low nibble, b2 in high nibble, then e, then m
+    out.put(outOffset, (byte) ((b2 << 4) | (b1 & 0b1111)));
+    out.put(outOffset + 1, (byte) excCount);
+    out.put(outOffset + 2, (byte) base);
+
+    return 3;
   }
 
   /**
-   * Chooses the primary bit width {@code b1} that minimizes total encoded chunk size.
+   * Choose the primary bit width {@code b1} that minimizes total encoded chunk size.
    *
-   * <p>For each candidate {@code b} from 0 to 8, computes:
+   * <p>This produces the width that results in the smallest total size and the number of exceptions
+   * for that width.
    *
-   * <ul>
-   *   <li>{@code e}: number of values needing more than {@code b} bits
-   *   <li>{@code b2 = maxWidth - b}: bits needed for exception remainders
-   *   <li>total size = {@code ceil(n * b / 8) + e + ceil(e * b2 / 8)}
-   * </ul>
+   * <p>Larger width is preferred on ties to reduce the number of exceptions.
    *
-   * Returns the {@code b} with minimum total size, preferring smaller {@code b} on ties.
+   * @param normalized value array to encode, after normalization
+   * @param length number of values in the array to encode
+   * @param maxWidth the largest bit width of normalized values
+   * @return a {@link Pair} of the chosen width and number of exceptions for that width
    */
   private static Pair<Integer, Integer> chooseBitWidth(int[] normalized, int length, int maxWidth) {
     int bestWidth = 0;
     int bestSize = Integer.MAX_VALUE;
     int bestExcCount = 0;
 
-    for (int b = 0; b <= maxWidth; b += 1) {
-      int e = 0;
-      if (b < 8) {
-        int threshold = 1 << b;
+    for (int candidateWidth = 0; candidateWidth <= maxWidth; candidateWidth += 1) {
+      int excCount = 0;
+      if (candidateWidth < 8) {
+        int threshold = 1 << candidateWidth;
         for (int i = 0; i < length; i += 1) {
           if (normalized[i] >= threshold) {
-            e += 1;
+            excCount += 1;
           }
         }
       }
 
-      int b2 = maxWidth - b;
-      int size = ceilDiv(length * b, 8) + e + ceilDiv(e * b2, 8);
+      int b2 = maxWidth - candidateWidth;
+      int size = byteWidth(length * candidateWidth) + excCount + byteWidth(excCount * b2);
 
-      if (size < bestSize) {
+      if (size <= bestSize) {
         bestSize = size;
-        bestWidth = b;
-        bestExcCount = e;
+        bestWidth = candidateWidth;
+        bestExcCount = excCount;
       }
     }
 
@@ -323,78 +359,18 @@ private static Pair<Integer, Integer> chooseBitWidth(int[] normalized, int lengt
   }
 
   /**
-   * Decodes one chunk of PFOR-encoded data, writing decoded values into {@code output[start,
-   * start+length)}.
+   * Return the lowest byte value from the array slice [start, start + length).
    *
-   * @return the number of bytes read from {@code data}
+   * <p>If length is less than 1, the result is 256, which is larger than any unsigned byte value.
+   *
+   * @param values array of values
+   * @param start starting index
+   * @param length number of values to check
+   * @return the min of the values in the array slice
    */
-  private static int decodeChunk(ByteBuffer data, int pos, int[] output, int start, int length) {
-    int b1 = data.get(pos) & 0x0F;
-    int b2 = (data.get(pos) >> 4) & 0x0F;
-    int excCount = data.get(pos + 1) & 0xFF;
-    int base = data.get(pos + 2) & 0xFF;
-    int cursor = pos + 3;
-
-    // Special case: b1=8 means raw bytes; e is always 0
-    if (b1 == 8) {
-      for (int i = 0; i < length; i += 1) {
-        output[start + i] = (data.get(cursor + i) & 0xFF) + base;
-      }
-      return cursor + length - pos;
-    }
-
-    // Read primary array: low b1 bits of each value
-    int[] values = new int[length];
-    if (b1 > 0) {
-      BitPacking.unpackBits(data, cursor, values, length, b1);
-      cursor += ceilDiv(length * b1, 8);
-    }
-
-    // Read exception offsets
-    int[] offsets = new int[excCount];
-    for (int i = 0; i < excCount; i += 1) {
-      offsets[i] = data.get(cursor) & 0xFF;
-      cursor += 1;
-    }
-
-    // Read exception values and patch the primary values
-    if (b2 > 0 && excCount > 0) {
-      int[] excValues = new int[excCount];
-      if (b2 == 8) {
-        for (int i = 0; i < excCount; i += 1) {
-          excValues[i] = data.get(cursor + i) & 0xFF;
-        }
-      } else {
-        BitPacking.unpackBits(data, cursor, excValues, excCount, b2);
-      }
-      cursor += ceilDiv(excCount * b2, 8);
-
-      for (int i = 0; i < excCount; i += 1) {
-        values[offsets[i]] |= excValues[i] << b1;
-      }
-    }
-
-    // Add back the chunk minimum
-    for (int i = 0; i < length; i += 1) {
-      output[start + i] = values[i] + base;
-    }
-
-    return cursor - pos;
-  }
-
-  // ---------------------------------------------------------------------------
-  // Utilities
-  // ---------------------------------------------------------------------------
-
-  private static int copyBytes(int[] src, int srcStart, int count, ByteBuffer out, int outPos) {
-    for (int i = 0; i < count; i += 1) {
-      out.put(outPos + i, (byte) src[srcStart + i]);
-    }
-    return outPos + count;
-  }
-
   private static int min(int[] values, int start, int length) {
-    int min = 255;
+    // 256 is larger than any unsigned byte value, so it signals no min (length < 1)
+    int min = 256;
     for (int i = start; i < start + length; i += 1) {
       if (values[i] < min) {
         min = values[i];
@@ -404,12 +380,29 @@ private static int min(int[] values, int start, int length) {
     return min;
   }
 
+  /** Returns the number of bytes required in the worst case to encode {@code valueCount} values. */
+  static int estimateEncodedSize(int valueCount) {
+    // Worst-case per chunk is b1=8: 3-byte header + 1 byte per value. Any other b1 chosen by the
+    // encoder costs <= n bytes of data (otherwise b1=8 would have been selected instead).
+    int numChunks = ceilDiv(valueCount, CHUNK_SIZE);
+    return 3 * numChunks + valueCount;
+  }
+
+  /** Returns the number of bytes required to encode a chunk of values. */
+  private static int encodedSize(int count, int b1, int b2, int excCount) {
+    return 3 + byteWidth(b1 * count) + excCount + byteWidth(b2 * excCount);
+  }
+
   /** Returns the number of bits required to represent {@code v} (0 for v=0). */
   static int width(int value) {
     return 32 - Integer.numberOfLeadingZeros(value);
   }
 
-  static int ceilDiv(int a, int b) {
+  private static int byteWidth(int bits) {
+    return ceilDiv(bits, 8);
+  }
+
+  private static int ceilDiv(int a, int b) {
     return (a + b - 1) / b;
   }
 }
diff --git a/core/src/test/java/org/apache/iceberg/mumbling/PFOREncodingTestUtils.java b/core/src/test/java/org/apache/iceberg/mumbling/PFOREncodingTestUtils.java
deleted file mode 100644
index db568ab107f3..000000000000
--- a/core/src/test/java/org/apache/iceberg/mumbling/PFOREncodingTestUtils.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.mumbling;
-
-import java.util.Random;
-
-/** Shared data-generation utilities for {@link PFOREncoding} tests and benchmarks. */
-class PFOREncodingTestUtils {
-
-  private PFOREncodingTestUtils() {}
-
-  /**
-   * Generates {@code count} values drawn uniformly from {@code [0, maxValue]} using the given seed.
-   */
-  static int[] uniform(int count, long seed, int maxValue) {
-    Random random = new Random(seed);
-    int[] values = new int[count];
-    for (int i = 0; i < count; i++) {
-      values[i] = random.nextInt(maxValue + 1);
-    }
-    return values;
-  }
-
-  /**
-   * Generates {@code count} values where each value is drawn from {@code [0, 3]} except that each
-   * position has a {@code exceptionPct}% chance of being replaced with a full-range value
-   * {@code [0, 255]}.
-   */
-  static int[] sparse(int count, long seed, int exceptionPct) {
-    Random random = new Random(seed);
-    int[] values = new int[count];
-    for (int i = 0; i < count; i++) {
-      values[i] = random.nextInt(100) < exceptionPct ? random.nextInt(256) : random.nextInt(4);
-    }
-    return values;
-  }
-
-  /**
-   * Generates {@code count} values drawn uniformly from {@code [minValue, minValue + range]} using
-   * the given seed.
-   */
-  static int[] withOffset(int count, long seed, int minValue, int range) {
-    Random random = new Random(seed);
-    int[] values = new int[count];
-    for (int i = 0; i < count; i++) {
-      values[i] = minValue + random.nextInt(range + 1);
-    }
-    return values;
-  }
-}
diff --git a/core/src/test/java/org/apache/iceberg/mumbling/PFORRandomData.java b/core/src/test/java/org/apache/iceberg/mumbling/PFORRandomData.java
new file mode 100644
index 000000000000..1ba375adb8bf
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/mumbling/PFORRandomData.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.mumbling;
+
+import java.util.Random;
+
+/** Data generation for {@link PFOREncoding} tests and benchmarks. */
+class PFORRandomData {
+
+  private PFORRandomData() {}
+
+  /** Generates {@code count} values between 0 and {@code maxValue}, inclusive. */
+  static int[] uniform(Random random, int count, int maxValue) {
+    int[] values = new int[count];
+    for (int i = 0; i < count; i++) {
+      values[i] = random.nextInt(maxValue + 1);
+    }
+
+    return values;
+  }
+
+  /** Generates values in 0-3, each replaced by a 0-255 exception with probability excPercent. */
+  static int[] exceptions(Random random, int count, float excPercent) {
+    int[] values = new int[count];
+    for (int i = 0; i < count; i++) {
+      values[i] = random.nextFloat() < excPercent ? random.nextInt(256) : random.nextInt(4);
+    }
+
+    return values;
+  }
+}
diff --git a/core/src/test/java/org/apache/iceberg/mumbling/TestMumblingBitmap.java b/core/src/test/java/org/apache/iceberg/mumbling/TestMumblingBitmap.java
index 3745ced61f2e..31ac59551a87 100644
--- a/core/src/test/java/org/apache/iceberg/mumbling/TestMumblingBitmap.java
+++ b/core/src/test/java/org/apache/iceberg/mumbling/TestMumblingBitmap.java
@@ -19,41 +19,50 @@
 package org.apache.iceberg.mumbling;
 
 import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
 
 import java.nio.ByteBuffer;
 import java.util.Arrays;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.junit.jupiter.api.Test;
 
 class TestMumblingBitmap {
 
-  // ---------------------------------------------------------------------------
-  // Empty bitmap (0 containers)
-  // ---------------------------------------------------------------------------
   @Test
   void testEmptyBitmap() {
-    MumblingBitmap bitmap = bitmap(new int[0]);
+    MumblingBitmap bitmap = bitmap();
+    assertThat(bitmap.cardinality()).isEqualTo(0);
+
+    // all positions beyond the bitmap range are false
     assertThat(bitmap.isSet(0)).isFalse();
     assertThat(bitmap.isSet(255)).isFalse();
     assertThat(bitmap.isSet(256)).isFalse();
   }
 
-  // ---------------------------------------------------------------------------
-  // Single empty sparse container (descriptor = 0, no container bytes)
-  // ---------------------------------------------------------------------------
+  @Test
+  void testInvalidPosition() {
+    MumblingBitmap bitmap = bitmap();
+    assertThat(bitmap.cardinality()).isEqualTo(0);
+    assertThat(bitmap.isSet(0)).isFalse();
+    assertThatThrownBy(() -> bitmap.isSet(-1))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Invalid bit position: -1 < 0");
+  }
+
   @Test
   void testEmptySparseContainer() {
     MumblingBitmap bitmap = bitmap(sparse());
+    assertThat(bitmap.cardinality()).isEqualTo(0);
     assertThat(bitmap.isSet(0)).isFalse();
     assertThat(bitmap.isSet(100)).isFalse();
     assertThat(bitmap.isSet(255)).isFalse();
   }
 
-  // ---------------------------------------------------------------------------
-  // Single sparse container with specific positions
-  // ---------------------------------------------------------------------------
   @Test
   void testSparseContainerSetPositions() {
     MumblingBitmap bitmap = bitmap(sparse(0, 5, 100, 255));
+    assertThat(bitmap.cardinality()).isEqualTo(4);
+
     assertThat(bitmap.isSet(0)).isTrue();
     assertThat(bitmap.isSet(5)).isTrue();
     assertThat(bitmap.isSet(100)).isTrue();
@@ -65,56 +74,44 @@ void testSparseContainerSetPositions() {
     assertThat(bitmap.isSet(99)).isFalse();
     assertThat(bitmap.isSet(101)).isFalse();
     assertThat(bitmap.isSet(254)).isFalse();
+    assertThat(bitmap.isSet(256)).isFalse();
   }
 
-  // ---------------------------------------------------------------------------
-  // Full sparse container (31 positions, the maximum)
-  // ---------------------------------------------------------------------------
   @Test
-  void testMaxSparseContainer() {
+  void testFullSparseContainer() {
     int[] positions = new int[31];
     for (int i = 0; i < 31; i += 1) {
       positions[i] = i * 8; // 0, 8, 16, ..., 240
     }
+
     MumblingBitmap bitmap = bitmap(sparse(positions));
+    assertThat(bitmap.cardinality()).isEqualTo(31);
+
     for (int p : positions) {
       assertThat(bitmap.isSet(p)).isTrue();
     }
+
     assertThat(bitmap.isSet(1)).isFalse();
     assertThat(bitmap.isSet(7)).isFalse();
     assertThat(bitmap.isSet(255)).isFalse();
   }
 
-  // ---------------------------------------------------------------------------
-  // Dense container: all bits set (32 bytes of 0xFF)
-  // ---------------------------------------------------------------------------
   @Test
-  void testDenseContainerAllSet() {
+  void testFullDenseContainer() {
     byte[] container = new byte[32];
     Arrays.fill(container, (byte) 0xFF);
+
     MumblingBitmap bitmap = bitmap(dense(container));
+    assertThat(bitmap.cardinality()).isEqualTo(256);
+
     for (int i = 0; i < 256; i += 1) {
       assertThat(bitmap.isSet(i)).isTrue();
     }
-  }
 
-  // ---------------------------------------------------------------------------
-  // Dense container: no bits set (32 bytes of 0x00)
-  // ---------------------------------------------------------------------------
-  @Test
-  void testDenseContainerNoneSet() {
-    byte[] container = new byte[32];
-    MumblingBitmap bitmap = bitmap(dense(container));
-    for (int i = 0; i < 256; i += 1) {
-      assertThat(bitmap.isSet(i)).isFalse();
-    }
+    assertThat(bitmap.isSet(256)).isFalse();
   }
 
-  // ---------------------------------------------------------------------------
-  // Dense container spec examples
-  // ---------------------------------------------------------------------------
-
-  // `FF FF FF FF 00 ... 00` → positions 0-31
+  // Example 1: positions 0-31: `FF FF FF FF 00 ... 00`
   @Test
   void testDenseSpecExample1() {
     byte[] container = new byte[32];
@@ -123,15 +120,17 @@ void testDenseSpecExample1() {
     container[2] = (byte) 0xFF;
     container[3] = (byte) 0xFF;
     MumblingBitmap bitmap = bitmap(dense(container));
+    assertThat(bitmap.cardinality()).isEqualTo(32);
+
     for (int i = 0; i <= 31; i += 1) {
       assertThat(bitmap.isSet(i)).isTrue();
     }
+
     assertThat(bitmap.isSet(32)).isFalse();
     assertThat(bitmap.isSet(255)).isFalse();
   }
 
-  // `FF FF FF FF A0 00 ... 00` → positions 0-32
-  // 0xA0 = 10100000: MSB (pos 32) is set, pos 33 is not
+  // Example 2: positions 0-32: `FF FF FF FF 80 00 ... 00`
   @Test
   void testDenseSpecExample2() {
     byte[] container = new byte[32];
@@ -139,16 +138,20 @@ void testDenseSpecExample2() {
     container[1] = (byte) 0xFF;
     container[2] = (byte) 0xFF;
     container[3] = (byte) 0xFF;
-    container[4] = (byte) 0xA0;
+    container[4] = (byte) 0x80;
+
     MumblingBitmap bitmap = bitmap(dense(container));
+    assertThat(bitmap.cardinality()).isEqualTo(33);
+
     for (int i = 0; i <= 32; i += 1) {
       assertThat(bitmap.isSet(i)).isTrue();
     }
+
     assertThat(bitmap.isSet(33)).isFalse();
     assertThat(bitmap.isSet(255)).isFalse();
   }
 
-  // `FF FF 00 ... 00 FF FF` → positions 0-15, 240-255
+  // Example 3: positions 0-15 and 240-255: `FF FF 00 ... 00 FF FF`
   @Test
   void testDenseSpecExample3() {
     byte[] container = new byte[32];
@@ -156,7 +159,10 @@ void testDenseSpecExample3() {
     container[1] = (byte) 0xFF;
     container[30] = (byte) 0xFF;
     container[31] = (byte) 0xFF;
+
     MumblingBitmap bitmap = bitmap(dense(container));
+    assertThat(bitmap.cardinality()).isEqualTo(32);
+
     for (int i = 0; i <= 15; i += 1) {
       assertThat(bitmap.isSet(i)).isTrue();
     }
@@ -165,85 +171,67 @@ void testDenseSpecExample3() {
     }
     assertThat(bitmap.isSet(16)).isFalse();
     assertThat(bitmap.isSet(239)).isFalse();
+    assertThat(bitmap.isSet(256)).isFalse();
   }
 
-  // `AA AA ... AA AA` → even positions: 0, 2, 4, ...
-  // 0xAA = 10101010: MSB-first gives positions 0, 2, 4, 6 set per byte
+  // Example 4: even positions 0, 2, 4, ...: `AA AA ... AA AA`
   @Test
   void testDenseSpecExample4() {
     byte[] container = new byte[32];
     Arrays.fill(container, (byte) 0xAA);
+
     MumblingBitmap bitmap = bitmap(dense(container));
+    assertThat(bitmap.cardinality()).isEqualTo(128);
+
     for (int i = 0; i < 256; i += 1) {
       assertThat(bitmap.isSet(i)).isEqualTo(i % 2 == 0);
     }
+
+    assertThat(bitmap.isSet(256)).isFalse();
   }
 
-  // ---------------------------------------------------------------------------
-  // Multiple containers: positions in different containers
-  // Container 0 (pos 0–255): sparse {5}
-  // Container 1 (pos 256–511): sparse {10} → global position 266
-  // Container 2 (pos 512–767): empty sparse
-  // ---------------------------------------------------------------------------
   @Test
   void testMultipleContainers() {
-    MumblingBitmap bitmap =
-        bitmap(
-            new ContainerSpec[] {
-              sparse(5), sparse(10), sparse()
-            });
+    MumblingBitmap bitmap = bitmap(sparse(5), sparse(), sparse(10));
+    assertThat(bitmap.cardinality()).isEqualTo(2);
 
     assertThat(bitmap.isSet(5)).isTrue(); // container 0, pos 5
-    assertThat(bitmap.isSet(266)).isTrue(); // container 1, pos 10
-    assertThat(bitmap.isSet(512)).isFalse(); // container 2, empty
+    assertThat(bitmap.isSet(256)).isFalse(); // container 1
+    assertThat(bitmap.isSet(522)).isTrue(); // container 2, pos 10
+
+    assertThat(bitmap.isSet(512)).isFalse();
     assertThat(bitmap.isSet(4)).isFalse();
     assertThat(bitmap.isSet(265)).isFalse();
     assertThat(bitmap.isSet(267)).isFalse();
   }
 
-  // ---------------------------------------------------------------------------
-  // Mixed sparse and dense containers
-  // Container 0: dense with byte 0 = 0xFF (positions 0–7 set)
-  // Container 1: sparse {0} (global position 256)
-  // ---------------------------------------------------------------------------
   @Test
   void testMixedSparseAndDense() {
     byte[] denseContainer = new byte[32];
     denseContainer[0] = (byte) 0xFF;
-    MumblingBitmap bitmap =
-        bitmap(
-            new ContainerSpec[] {
-              dense(denseContainer), sparse(0)
-            });
+    denseContainer[1] = (byte) 0xFF;
+    denseContainer[2] = (byte) 0xFF;
+    denseContainer[3] = (byte) 0xFF;
+
+    MumblingBitmap bitmap = bitmap(dense(denseContainer), sparse(1));
+    assertThat(bitmap.cardinality()).isEqualTo(33);
 
-    for (int i = 0; i < 8; i += 1) {
+    for (int i = 0; i < 32; i += 1) {
       assertThat(bitmap.isSet(i)).isTrue();
     }
-    assertThat(bitmap.isSet(8)).isFalse();
-    assertThat(bitmap.isSet(256)).isTrue(); // container 1, pos 0
-    assertThat(bitmap.isSet(257)).isFalse();
-  }
+    assertThat(bitmap.isSet(32)).isFalse();
 
-  // ---------------------------------------------------------------------------
-  // Position beyond container count is always unset
-  // ---------------------------------------------------------------------------
-  @Test
-  void testPositionBeyondContainerCount() {
-    MumblingBitmap bitmap = bitmap(sparse(5));
-    // Only container 0 exists; container 1 (pos 256+) does not
     assertThat(bitmap.isSet(256)).isFalse();
-    assertThat(bitmap.isSet(511)).isFalse();
+    assertThat(bitmap.isSet(257)).isTrue(); // container 1, pos 1
+    assertThat(bitmap.isSet(258)).isFalse();
   }
 
-  // ---------------------------------------------------------------------------
-  // Buffer with non-zero initial position
-  // ---------------------------------------------------------------------------
   @Test
   void testBufferWithOffset() {
     // Prepend 4 bytes of garbage before the actual bitmap data
-    ByteBuffer raw = build(new ContainerSpec[] {sparse(42)});
-    byte[] rawBytes = new byte[raw.remaining()];
-    raw.get(rawBytes);
+    ByteBuffer buffer = build(sparse(42));
+    byte[] rawBytes = new byte[buffer.remaining()];
+    buffer.get(rawBytes);
 
     ByteBuffer padded = ByteBuffer.allocate(4 + rawBytes.length);
     padded.position(4);
@@ -251,102 +239,115 @@ void testBufferWithOffset() {
     padded.position(4); // position the buffer at the start of bitmap data
 
     MumblingBitmap bitmap = new MumblingBitmap(padded);
+    assertThat(bitmap.cardinality()).isEqualTo(1);
+
+    assertThat(bitmap.isSet(41)).isFalse();
     assertThat(bitmap.isSet(42)).isTrue();
     assertThat(bitmap.isSet(43)).isFalse();
   }
 
-  // ---------------------------------------------------------------------------
-  // Lazy init: calling isSet multiple times produces consistent results
-  // ---------------------------------------------------------------------------
-  @Test
-  void testLazyInitConsistency() {
-    MumblingBitmap bitmap = bitmap(sparse(1, 2, 3));
-    for (int trial = 0; trial < 3; trial += 1) {
-      assertThat(bitmap.isSet(1)).isTrue();
-      assertThat(bitmap.isSet(2)).isTrue();
-      assertThat(bitmap.isSet(3)).isTrue();
-      assertThat(bitmap.isSet(0)).isFalse();
-      assertThat(bitmap.isSet(4)).isFalse();
-    }
-  }
-
-  // ---------------------------------------------------------------------------
-  // Helpers
-  // ---------------------------------------------------------------------------
-
-  /** Descriptor + bytes for a sparse container. */
-  private static ContainerSpec sparse(int... positions) {
+  /** Builds a sparse container; positions must be strictly increasing and each in [0, 255]. */
+  private static Container sparse(int... positions) {
     byte[] bytes = new byte[positions.length];
     for (int i = 0; i < positions.length; i += 1) {
+      Preconditions.checkArgument(
+          (positions[i] & ~0xFF) == 0, "Invalid position in container: %s", positions[i]);
+      if (i > 0) {
+        Preconditions.checkArgument(
+            positions[i] > positions[i - 1],
+            "Invalid sparse container: pos %s=%s >= pos %s=%s",
+            i - 1,
+            positions[i - 1],
+            i,
+            positions[i]);
+      }
+
       bytes[i] = (byte) positions[i];
     }
-    return new ContainerSpec(positions.length, bytes);
+    return new Container(bytes);
   }
 
   /** Descriptor + bytes for a dense container. */
-  private static ContainerSpec dense(byte[] container) {
-    if (container.length != 32) {
-      throw new IllegalArgumentException("Dense container must be 32 bytes");
-    }
-    return new ContainerSpec(32, container);
+  private static Container dense(byte[] container) {
+    Preconditions.checkArgument(container.length == 32, "Dense container must be 32 bytes");
+    return new Container(container);
   }
 
-  private static class ContainerSpec {
-    final int descriptor;
-    final byte[] bytes;
+  private static class Container {
+    private final byte[] bytes;
+    private final int descriptor;
+    private final int cardinality;
 
-    ContainerSpec(int descriptor, byte[] bytes) {
-      this.descriptor = descriptor;
+    Container(byte[] bytes) {
       this.bytes = bytes;
+      this.descriptor = bytes.length;
+      this.cardinality = cardinality(bytes);
     }
-  }
 
-  /** Builds a bitmap with a single container. */
-  private static MumblingBitmap bitmap(ContainerSpec spec) {
-    return new MumblingBitmap(build(new ContainerSpec[] {spec}));
-  }
-
-  /** Builds a bitmap with no containers. */
-  private static MumblingBitmap bitmap(int[] ignored) {
-    return new MumblingBitmap(build(new ContainerSpec[0]));
+    private static int cardinality(byte[] bytes) {
+      if (bytes.length < 32) {
+        return bytes.length;
+      } else if (bytes.length == 32) {
+        int setBits = 0;
+        for (byte b : bytes) {
+          setBits += Integer.bitCount(b & 0xFF);
+        }
+
+        Preconditions.checkArgument(
+            setBits > 31, "Invalid dense container: %s values should be sparse", setBits);
+
+        return setBits;
+      } else {
+        throw new IllegalArgumentException("Invalid container: longer than 32 bytes");
+      }
+    }
   }
 
-  /** Builds a bitmap with multiple containers. */
-  private static MumblingBitmap bitmap(ContainerSpec[] specs) {
-    return new MumblingBitmap(build(specs));
+  private static MumblingBitmap bitmap(Container... containers) {
+    return new MumblingBitmap(build(containers));
   }
 
-  private static ByteBuffer build(ContainerSpec[] specs) {
-    int[] descriptors = new int[specs.length];
-    for (int i = 0; i < specs.length; i += 1) {
-      descriptors[i] = specs[i].descriptor;
+  private static ByteBuffer build(Container... containers) {
+    Preconditions.checkArgument(
+        containers.length <= 8192, "Invalid container count (max 8192): %s", containers.length);
+
+    int[] descriptors = new int[containers.length];
+    int cardinality = 0;
+    int sizeEstimate = 6;
+    for (int i = 0; i < containers.length; i += 1) {
+      descriptors[i] = containers[i].descriptor;
+      cardinality += containers[i].cardinality;
+      sizeEstimate += containers[i].bytes.length;
     }
 
-    ByteBuffer encodedDescriptors =
-        specs.length > 0 ? PFOREncoding.encode(descriptors, specs.length) : ByteBuffer.allocate(0);
-
-    int totalContainerBytes = 0;
-    for (ContainerSpec spec : specs) {
-      totalContainerBytes += spec.bytes.length;
+    Preconditions.checkArgument(
+        cardinality <= 2_097_152, "Invalid cardinality (max 2,097,152): %s", cardinality);
+
+    sizeEstimate += PFOREncoding.estimateEncodedSize(containers.length);
+    ByteBuffer buf = ByteBuffer.allocate(sizeEstimate);
+
+    // header: version (1 byte), cardinality (3 bytes LE), container count (2 bytes LE)
+    buf.put(0, (byte) 1);
+    buf.put(1, (byte) (cardinality & 0xFF));
+    buf.put(2, (byte) ((cardinality >>> 8) & 0xFF));
+    buf.put(3, (byte) ((cardinality >>> 16) & 0xFF));
+    buf.put(4, (byte) (containers.length & 0xFF));
+    buf.put(5, (byte) ((containers.length >>> 8) & 0xFF));
+
+    // write encoded descriptors
+    int descriptorArraySize =
+        PFOREncoding.encode(descriptors, 0, buf, buf.position() + 6, descriptors.length);
+
+    // copy each container's bytes into the buffer, starting right after the descriptors
+    int containerOffset = 6 + descriptorArraySize;
+    for (Container spec : containers) {
+      buf.put(containerOffset, spec.bytes);
+      containerOffset += spec.bytes.length;
     }
 
-    int totalSize = 6 + encodedDescriptors.remaining() + totalContainerBytes;
-    ByteBuffer buf = ByteBuffer.allocate(totalSize);
-
-    // Header: version (1 byte), cardinality (3 bytes LE), container count (2 bytes LE)
-    buf.put((byte) 1);
-    buf.put((byte) 0);
-    buf.put((byte) 0);
-    buf.put((byte) 0);
-    buf.put((byte) (specs.length & 0xFF));
-    buf.put((byte) ((specs.length >>> 8) & 0xFF));
-
-    buf.put(encodedDescriptors);
-    for (ContainerSpec spec : specs) {
-      buf.put(spec.bytes);
-    }
+    // the offset after the last container is the length
+    buf.limit(containerOffset);
 
-    buf.flip();
     return buf;
   }
 }
diff --git a/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncoding.java b/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncoding.java
index e2c28aca160f..20193fbd23fd 100644
--- a/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncoding.java
+++ b/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncoding.java
@@ -25,134 +25,82 @@
 import org.apache.iceberg.util.ByteBuffers;
 import org.junit.jupiter.api.Test;
 
-/**
- * Tests for PFOR encoding based on the examples in Appendix A of the Mumbling bitmap spec.
- *
- * <p>Note: examples 3 and 5 from the spec appear to contain typos in the encoded byte sequences.
- * The expected byte arrays used here are derived from the spec algorithm applied to the stated
- * decoded values, and the tests verify both the byte-level encoding and the roundtrip decode.
- *
- * <p>Example 3 spec header is {@code 80 00 00} but {@code e} must be 2 (two exceptions), so the
- * correct header is {@code 80 02 00}.
- *
- * <p>Example 5 spec bytes are {@code 42 01 06 19 01 A0}, but with {@code b1=2} the cost-minimizing
- * {@code b2 = maxWidth - b1 = 5 - 2 = 3} (not 4), producing header {@code 32 01 06}.
- */
 public class TestPFOREncoding {
 
-  // ---------------------------------------------------------------------------
-  // Spec example 1: 256 values, all = 0
-  // Encoding: 0 bits per value, m = 0, no exceptions
-  // Expected: 00 00 00
-  // ---------------------------------------------------------------------------
+  // Example 1: 256 values, b1=0, m=0, all=0: `00 00 00`
   @Test
   public void testExample1AllZeros() {
     int[] values = new int[256];
     Arrays.fill(values, 0);
 
-    ByteBuffer encoded = PFOREncoding.encode(values);
+    ByteBuffer encoded = PFOREncoding.encode(values, values.length);
     assertThat(ByteBuffers.toByteArray(encoded)).isEqualTo(bytes(0x00, 0x00, 0x00));
 
     int[] decoded = PFOREncoding.decode(encoded, 256);
     assertThat(decoded).isEqualTo(values);
   }
 
-  // ---------------------------------------------------------------------------
-  // Spec example 2: 51 values, all = 5
-  // Encoding: 0 bits per value, m = 5, no exceptions
-  // Expected: 00 00 05
-  // ---------------------------------------------------------------------------
+  // Example 2: 51 values, b1=0, m=5, all=5: `00 00 05`
   @Test
   public void testExample2AllFives() {
     int[] values = new int[51];
     Arrays.fill(values, 5);
 
-    ByteBuffer encoded = PFOREncoding.encode(values);
+    ByteBuffer encoded = PFOREncoding.encode(values, values.length);
     assertThat(ByteBuffers.toByteArray(encoded)).isEqualTo(bytes(0x00, 0x00, 0x05));
 
     int[] decoded = PFOREncoding.decode(encoded, 51);
     assertThat(decoded).isEqualTo(values);
   }
 
-  // ---------------------------------------------------------------------------
-  // Spec example 3: [0, 0, 0, 0, 0xFF, 0, 0, 0xFE]
-  // Encoding: b1=0, b2=8, m=0, 2 exceptions at positions 4 and 7
-  //
-  // Note: the spec shows header bytes "80 00 00" but e=0 is incorrect; there are
-  // 2 exceptions so the correct encoding is "80 02 00 04 07 FF FE".
-  // ---------------------------------------------------------------------------
+  // Example 3: only exception values, b1=0, b2=8: `80 02 00 04 07 FF FE`
   @Test
   public void testExample3SparseExceptions() {
     int[] values = {0, 0, 0, 0, 0xFF, 0, 0, 0xFE};
 
-    // b1=0, b2=8, e=2, m=0  →  header byte0=(8<<4)|0=0x80, byte1=0x02, byte2=0x00
-    // primary: empty (b1=0)
-    // offsets: 0x04, 0x07
-    // exception values: 0xFF, 0xFE (8 bits each)
     byte[] expected =
         bytes(0x80, 0x02, 0x00, /* offsets */ 0x04, 0x07, /* exceptions */ 0xFF, 0xFE);
 
-    ByteBuffer encoded = PFOREncoding.encode(values);
+    ByteBuffer encoded = PFOREncoding.encode(values, values.length);
     assertThat(ByteBuffers.toByteArray(encoded)).isEqualTo(expected);
 
     int[] decoded = PFOREncoding.decode(encoded, values.length);
     assertThat(decoded).isEqualTo(values);
   }
 
-  // ---------------------------------------------------------------------------
-  // Spec example 4: [6, 7, 8]
-  // Encoding: b1=2, b2=0, m=6, no exceptions
-  // Expected: 02 00 06 18
-  // ---------------------------------------------------------------------------
+  // Example 4: [6, 7, 8], no exceptions, b1=2, m=6: `02 00 06 18`
   @Test
   public void testExample4TwoBitsNoExceptions() {
     int[] values = {6, 7, 8};
 
-    // b1=2, b2=0, e=0, m=6  →  header byte0=(0<<4)|2=0x02, byte1=0x00, byte2=0x06
-    // normalized: [0, 1, 2]
-    // primary: 00 01 10 (padded) = 0b00011000 = 0x18
     byte[] expected = bytes(0x02, 0x00, 0x06, 0x18);
 
-    ByteBuffer encoded = PFOREncoding.encode(values);
+    ByteBuffer encoded = PFOREncoding.encode(values, values.length);
     assertThat(ByteBuffers.toByteArray(encoded)).isEqualTo(expected);
 
     int[] decoded = PFOREncoding.decode(encoded, values.length);
     assertThat(decoded).isEqualTo(values);
   }
 
-  // ---------------------------------------------------------------------------
-  // Spec example 5: [6, 34, 8, 7]
-  // Encoding: b1=2, b2=3, m=6, 1 exception (value 34 → normalized 28)
-  //
-  // Note: the spec shows "42 01 06 19 01 A0" but that byte sequence is inconsistent
-  // with the stated decoded values. With b1=2 and maxWidth=5, the cost-minimizing
-  // b2 = maxWidth - b1 = 3 (not 4). The correct encoding is:
-  //   header:    32 01 06  (b1=2, b2=3, e=1, m=6)
-  //   primary:   09        ([0,0,2,1] low 2 bits packed MSB-first)
-  //   offset:    01        (exception at position 1)
-  //   exception: E0        (7 = 28>>2, packed as 3 bits: 111_00000)
-  // ---------------------------------------------------------------------------
+  // Example 5: [6, 34, 8, 7], b1=2, b2=3, m=6, 1 exception: `32 01 06 09 01 E0`
   @Test
-  public void testExample5TwoBitsOneException() {
+  public void testExample5() {
     int[] values = {6, 34, 8, 7};
 
-    // b1=2, b2=3, e=1, m=6  →  header byte0=(3<<4)|2=0x32, byte1=0x01, byte2=0x06
-    // normalized: [0, 28, 2, 1]
-    // primary:    [00, 00, 10, 01] packed MSB-first = 0b00001001 = 0x09
-    // exception offset: 0x01  (position 1)
-    // exception value:  28>>2 = 7 = 0b111, 3 bits MSB-first = 0b111_00000 = 0xE0
-    byte[] expected = bytes(0x32, 0x01, 0x06, 0x09, 0x01, 0xE0);
+    // the encoder picks b1=5 with no exceptions: same 6-byte size as the spec's b1=2, b2=3 form
+    byte[] expected = bytes(0x05, 0x00, 0x06, 0x07, 0x04, 0x10);
+    byte[] fromSpec = bytes(0x32, 0x01, 0x06, 0x09, 0x01, 0xE0);
 
-    ByteBuffer encoded = PFOREncoding.encode(values);
+    ByteBuffer encoded = PFOREncoding.encode(values, values.length);
     assertThat(ByteBuffers.toByteArray(encoded)).isEqualTo(expected);
 
     int[] decoded = PFOREncoding.decode(encoded, values.length);
     assertThat(decoded).isEqualTo(values);
+
+    int[] decodedFromSpec = PFOREncoding.decode(ByteBuffer.wrap(fromSpec), values.length);
+    assertThat(decodedFromSpec).isEqualTo(values);
   }
 
-  // ---------------------------------------------------------------------------
-  // Helper to convert varargs ints to a byte array (treating each int as a byte)
-  // ---------------------------------------------------------------------------
   private static byte[] bytes(int... values) {
     byte[] result = new byte[values.length];
     for (int i = 0; i < values.length; i++) {
diff --git a/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java b/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java
index fb6cdf1bd021..3b001415eba5 100644
--- a/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java
+++ b/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java
@@ -21,148 +21,108 @@
 import static org.assertj.core.api.Assertions.assertThat;
 
 import java.nio.ByteBuffer;
+import java.util.Random;
 import java.util.stream.Stream;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;
 
 /**
- * Randomized roundtrip tests for {@link PFOREncoding}.
- *
- * <p>Each test case embeds a fixed seed so that failures are reproducible. The cases cover:
- *
- * <ul>
- *   <li><b>Uniform distributions</b> at several value ranges to exercise different primary bit
- *       widths ({@code b1}): 0-1 → b1=1, 0-3 → b1=2, 0-31 → b1=5, 0-255 → b1=8.
- *   <li><b>Size boundaries</b> around 8 (the pack/unpack group size) and 256 (the chunk size), to
- *       exercise the remainder logic and multi-chunk splitting.
- *   <li><b>Sparse distributions</b> — mostly-uniform base values with a small fraction of outliers
- *       — to exercise the exception encoding path, where a large {@code b2} is needed.
- *   <li><b>Non-zero minimum</b> values, to exercise the {@code m} subtraction.
- * </ul>
+ * Randomized round-trip tests for {@link PFOREncoding}.
  */
 public class TestPFOREncodingRandom {
+  private static final Random RANDOM = new Random(3546521684L);
 
-  // ---------------------------------------------------------------------------
-  // Uniform distributions at different value ranges
-  // ---------------------------------------------------------------------------
-
-  /**
-   * Each row: (description, count, seed, maxValue). Values are drawn uniformly from [0, maxValue].
-   * The choice of maxValue determines which primary bit width {@code b1} the encoder will select.
-   */
-  static Stream<Arguments> uniformCases() {
+  static Stream<Arguments> uniformRandomCases() {
     return Stream.of(
         // 1-bit range (b1=1): values in {0, 1}
-        Arguments.of("1-bit range, size=1",    1,   0x1a2b3c4dL, 1),
-        Arguments.of("1-bit range, size=7",    7,   0x5e6f7a8bL, 1),
-        Arguments.of("1-bit range, size=8",    8,   0xdeadbeefL, 1),
-        Arguments.of("1-bit range, size=9",    9,   0xcafebabeL, 1),
-        Arguments.of("1-bit range, size=256",  256, 0xf00dfa11L, 1),
+        Arguments.of("1-bit range, count=1", 1, 1),
+        Arguments.of("1-bit range, count=7", 7, 1),
+        Arguments.of("1-bit range, count=8", 8, 1),
+        Arguments.of("1-bit range, count=9", 9, 1),
+        Arguments.of("1-bit range, count=256", 256, 1),
 
         // 2-bit range (b1=2): values in [0, 3]
-        Arguments.of("2-bit range, size=7",    7,   0x1337c0deL, 3),
-        Arguments.of("2-bit range, size=8",    8,   0xa5a5a5a5L, 3),
-        Arguments.of("2-bit range, size=9",    9,   0x0badf00dL, 3),
-        Arguments.of("2-bit range, size=100",  100, 0x12345678L, 3),
-        Arguments.of("2-bit range, size=256",  256, 0x87654321L, 3),
+        Arguments.of("2-bit range, count=7", 7, 3),
+        Arguments.of("2-bit range, count=8", 8, 3),
+        Arguments.of("2-bit range, count=9", 9, 3),
+        Arguments.of("2-bit range, count=100", 100, 3),
+        Arguments.of("2-bit range, count=256", 256, 3),
 
         // 3-bit range (b1=3): values in [0, 7]
-        Arguments.of("3-bit range, size=7",    7,   0xabcdef01L, 7),
-        Arguments.of("3-bit range, size=24",   24,  0x01fedcbaL, 7),
-        Arguments.of("3-bit range, size=256",  256, 0xbeefcafeL, 7),
+        Arguments.of("3-bit range, count=7", 7, 7),
+        Arguments.of("3-bit range, count=24", 24, 7),
+        Arguments.of("3-bit range, count=256", 256, 7),
 
-        // 5-bit range (b1=5): values in [0, 31] — typical for Mumbling descriptor bytes
-        Arguments.of("5-bit range, size=7",    7,   0x11223344L, 31),
-        Arguments.of("5-bit range, size=8",    8,   0x44332211L, 31),
-        Arguments.of("5-bit range, size=63",   63,  0xaabbccddL, 31),
-        Arguments.of("5-bit range, size=64",   64,  0xddccbbaaL, 31),
-        Arguments.of("5-bit range, size=256",  256, 0x55667788L, 31),
+        // 6-bit range (b1=6): values in [0, 32] for Mumbling descriptor bytes
+        Arguments.of("6-bit range, count=7", 7, 32),
+        Arguments.of("6-bit range, count=8", 8, 32),
+        Arguments.of("6-bit range, count=63", 63, 32),
+        Arguments.of("6-bit range, count=64", 64, 32),
+        Arguments.of("6-bit range, count=256", 256, 32),
 
         // Full byte range (b1=8 or b1 chosen by cost): values in [0, 255]
-        Arguments.of("full range, size=1",     1,   0x99aabbccL, 255),
-        Arguments.of("full range, size=7",     7,   0xcc998877L, 255),
-        Arguments.of("full range, size=8",     8,   0x13572468L, 255),
-        Arguments.of("full range, size=9",     9,   0x24681357L, 255),
-        Arguments.of("full range, size=15",    15,  0xfedcba98L, 255),
-        Arguments.of("full range, size=16",    16,  0x89abcdefL, 255),
-        Arguments.of("full range, size=100",   100, 0x5a5a5a5aL, 255),
-        Arguments.of("full range, size=255",   255, 0xa1b2c3d4L, 255),
-        Arguments.of("full range, size=256",   256, 0xd4c3b2a1L, 255),
-        Arguments.of("full range, size=257",   257, 0x1f2e3d4cL, 255),
-        Arguments.of("full range, size=512",   512, 0x4c3d2e1fL, 255),
-        Arguments.of("full range, size=513",   513, 0xface0ffL,  255));
+        Arguments.of("full range, count=1", 1, 255),
+        Arguments.of("full range, count=7", 7, 255),
+        Arguments.of("full range, count=8", 8, 255),
+        Arguments.of("full range, count=9", 9, 255),
+        Arguments.of("full range, count=15", 15, 255),
+        Arguments.of("full range, count=16", 16, 255),
+        Arguments.of("full range, count=100", 100, 255),
+        Arguments.of("full range, count=255", 255, 255),
+        Arguments.of("full range, count=256", 256, 255),
+        Arguments.of("full range, count=257", 257, 255),
+        Arguments.of("full range, count=512", 512, 255),
+        Arguments.of("full range, count=513", 513, 255));
   }
 
   @ParameterizedTest(name = "{0}")
-  @MethodSource("uniformCases")
-  public void testUniformRandom(String name, int count, long seed, int maxValue) {
-    assertRoundtrip(PFOREncodingTestUtils.uniform(count, seed, maxValue));
+  @MethodSource("uniformRandomCases")
+  public void testUniformRandom(String name, int count, int maxValue) {
+    assertRoundTrip(PFORRandomData.uniform(RANDOM, count, maxValue));
   }
 
-  // ---------------------------------------------------------------------------
-  // Sparse distributions — tests exception encoding
-  //
-  // Most values are drawn from a narrow base range (0-3), but a fraction are
-  // replaced with full-range values (0-255). This creates chunks where b1 is
-  // small but e > 0, which exercises the exception path with b2 > 0.
-  // ---------------------------------------------------------------------------
-
-  /**
-   * Each row: (description, count, seed, exceptionProbabilityPct). Values are 0-3 except that each
-   * position has a {@code exceptionProbabilityPct}% chance of becoming a full-range outlier
-   * (0-255).
-   */
-  static Stream<Arguments> sparseCases() {
+  static Stream<Arguments> exceptionCases() {
     return Stream.of(
-        Arguments.of("sparse 10%, size=7",    7,   0x2a3b4c5dL, 10),
-        Arguments.of("sparse 10%, size=8",    8,   0x3c4d5e6fL, 10),
-        Arguments.of("sparse 10%, size=9",    9,   0x4e5f607aL, 10),
-        Arguments.of("sparse 10%, size=100",  100, 0x6b7c8d9eL, 10),
-        Arguments.of("sparse 10%, size=256",  256, 0x8d9eafb0L, 10),
-        Arguments.of("sparse 25%, size=256",  256, 0xc1d2e3f4L, 25),
-        Arguments.of("sparse 50%, size=256",  256, 0xf4e3d2c1L, 50),
-        Arguments.of("sparse 10%, size=512",  512, 0x10293847L, 10));
+        Arguments.of("Exception 10%, count=7", 7, 0.10f),
+        Arguments.of("Exception 10%, count=8", 8, 0.10f),
+        Arguments.of("Exception 10%, count=9", 9, 0.10f),
+        Arguments.of("Exception 10%, count=100", 100, 0.10f),
+        Arguments.of("Exception 10%, count=256", 256, 0.10f),
+        Arguments.of("Exception 25%, count=256", 256, 0.25f),
+        Arguments.of("Exception 50%, count=256", 256, 0.50f),
+        Arguments.of("Exception 10%, count=512", 512, 0.10f));
   }
 
   @ParameterizedTest(name = "{0}")
-  @MethodSource("sparseCases")
-  public void testSparseRandom(String name, int count, long seed, int exceptionPct) {
-    assertRoundtrip(PFOREncodingTestUtils.sparse(count, seed, exceptionPct));
+  @MethodSource("exceptionCases")
+  public void testRandomWithExceptions(String name, int count, float excPercent) {
+    assertRoundTrip(PFORRandomData.exceptions(RANDOM, count, excPercent));
   }
 
-  // ---------------------------------------------------------------------------
-  // Non-zero minimum — tests the m (minimum subtraction) mechanism
-  //
-  // Values are drawn from a range that does not include 0, so m > 0 and the
-  // encoder must subtract it before packing. After decoding, m is added back.
-  // ---------------------------------------------------------------------------
-
-  /**
-   * Each row: (description, count, seed, minValue, range). Values are drawn uniformly from
-   * [minValue, minValue + range].
-   */
-  static Stream<Arguments> offsetCases() {
+  static Stream<Arguments> offsetRangeCases() {
     return Stream.of(
-        Arguments.of("offset m=100 range=3, size=8",   8,   0x31415926L, 100, 3),
-        Arguments.of("offset m=100 range=3, size=256", 256, 0x27182818L, 100, 3),
-        Arguments.of("offset m=50  range=31, size=64", 64,  0x16180339L,  50, 31),
-        Arguments.of("offset m=200 range=55, size=100",100, 0x14142135L, 200, 55),
-        Arguments.of("offset m=128 range=127, size=256",256,0x17320508L, 128, 127));
+        Arguments.of("offset min=100 rangeSize=3, count=8", 8, 100, 3),
+        Arguments.of("offset min=100 rangeSize=3, count=256", 256, 100, 3),
+        Arguments.of("offset min=50  rangeSize=31, count=64", 64, 50, 31),
+        Arguments.of("offset min=200 rangeSize=55, count=100", 100, 200, 55),
+        Arguments.of("offset min=128 rangeSize=127, count=256", 256, 128, 127));
   }
 
   @ParameterizedTest(name = "{0}")
-  @MethodSource("offsetCases")
-  public void testOffsetRandom(String name, int count, long seed, int minValue, int range) {
-    assertRoundtrip(PFOREncodingTestUtils.withOffset(count, seed, minValue, range));
+  @MethodSource("offsetRangeCases")
+  public void testOffsetRandom(String name, int count, int minValue, int rangeSize) {
+    // generate values between 0 and rangeSize, then add the min value offset
+    int[] values = PFORRandomData.uniform(RANDOM, count, rangeSize);
+    for (int i = 0; i < values.length; i += 1) {
+      values[i] += minValue;
+    }
+
+    assertRoundTrip(values);
   }
 
-  // ---------------------------------------------------------------------------
-  // Helper
-  // ---------------------------------------------------------------------------
-
-  private static void assertRoundtrip(int[] values) {
-    ByteBuffer encoded = PFOREncoding.encode(values);
+  private static void assertRoundTrip(int[] values) {
+    ByteBuffer encoded = PFOREncoding.encode(values, values.length);
     int[] decoded = PFOREncoding.decode(encoded, values.length);
     assertThat(decoded).isEqualTo(values);
   }

From 224ff180e87d8054cb7c1dc28afe0581d7d42889 Mon Sep 17 00:00:00 2001
From: Ryan Blue <blue@apache.org>
Date: Thu, 11 Jun 2026 13:34:20 -0700
Subject: [PATCH 4/5] Apply spotless.

---
 .../org/apache/iceberg/mumbling/TestPFOREncodingRandom.java   | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java b/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java
index 3b001415eba5..5f36ab5ce99e 100644
--- a/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java
+++ b/core/src/test/java/org/apache/iceberg/mumbling/TestPFOREncodingRandom.java
@@ -27,9 +27,7 @@
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;
 
-/**
- * Randomized round-trip tests for {@link PFOREncoding}.
- */
+/** Randomized round-trip tests for {@link PFOREncoding}. */
 public class TestPFOREncodingRandom {
   private static final Random RANDOM = new Random(3546521684L);
 

From 6d18a863b8f864741f5c16f26ca631bc6061f618 Mon Sep 17 00:00:00 2001
From: Ryan Blue <blue@apache.org>
Date: Thu, 11 Jun 2026 13:34:53 -0700
Subject: [PATCH 5/5] Fix 5%.

---
 .../java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java b/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java
index cbb5d69e85f8..418a032f5e75 100644
--- a/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java
+++ b/core/src/jmh/java/org/apache/iceberg/mumbling/PFOREncodingBenchmark.java
@@ -89,7 +89,7 @@ public void setupBenchmark() {
     Random random = new Random(1938745);
 
     // 256-value descriptor-like data: mostly [0,31] with ~5% [0,255] outliers
-    descriptorValues = PFORRandomData.exceptions(random, 256, 0.5f);
+    descriptorValues = PFORRandomData.exceptions(random, 256, 0.05f);
 
     // 256-value uniform byte data
     uniformValues = PFORRandomData.uniform(random, 256, 255);