diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java index e37ee12483..5c7600a43f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java @@ -37,6 +37,18 @@ public abstract class Binary implements Comparable, Serializable { protected boolean isBackingBytesReused; + /** + * Cached hash code for non-reused (immutable) Binary instances. + *

The sentinel value {@code 0} means "not yet computed". This follows the + * {@link String#hashCode()} idiom: races between concurrent first calls are benign + * because the computation is deterministic, and a hash that genuinely equals {@code 0} + * will simply be recomputed on every call (acceptably rare). Reused instances never + * cache (their backing bytes can mutate after construction). + *

Package-private (rather than private) so subclasses can read it directly without + * an extra method call on the {@link #hashCode()} hot path. + */ + transient int cachedHashCode; + // this isn't really something others should extend private Binary() {} @@ -101,6 +113,18 @@ public boolean equals(Object obj) { return false; } + /** + * Caches {@code hashCode} for non-reused instances and returns it. The cache uses + * a single int field with sentinel {@code 0} to remain race-safe without volatile. + * If the computed hash is {@code 0}, no caching occurs and the next call recomputes. + */ + final int cacheHashCode(int hashCode) { + if (!isBackingBytesReused) { + cachedHashCode = hashCode; + } + return hashCode; + } + @Override public String toString() { return "Binary{" + length() @@ -180,7 +204,11 @@ public Binary slice(int start, int length) { @Override public int hashCode() { - return Binary.hashCode(value, offset, length); + int h = cachedHashCode; + if (h != 0) { + return h; + } + return cacheHashCode(Binary.hashCode(value, offset, length)); } @Override @@ -340,7 +368,11 @@ public Binary slice(int start, int length) { @Override public int hashCode() { - return Binary.hashCode(value, 0, value.length); + int h = cachedHashCode; + if (h != 0) { + return h; + } + return cacheHashCode(Binary.hashCode(value, 0, value.length)); } @Override @@ -499,11 +531,18 @@ public Binary slice(int start, int length) { @Override public int hashCode() { + int h = cachedHashCode; + if (h != 0) { + return h; + } + + int computedHashCode; if (value.hasArray()) { - return Binary.hashCode(value.array(), value.arrayOffset() + offset, length); + computedHashCode = Binary.hashCode(value.array(), value.arrayOffset() + offset, length); } else { - return Binary.hashCode(value, offset, length); + computedHashCode = Binary.hashCode(value, offset, length); } + return cacheHashCode(computedHashCode); } @Override diff --git a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java index a1a83af771..19085b2244 100644 --- a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java +++ b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java @@ -155,6 +155,51 @@ public void testEqualityMethods() throws Exception { assertEquals(bin1, bin2); } + /** + * Verifies that {@link Binary#hashCode()} is cached for non-reused (constant) instances: + * the value returned must be stable, equal across the three concrete Binary + * implementations for the same bytes, and consistent with {@link Object#equals(Object)}. + */ + @Test + public void testHashCodeCachedForConstantBinary() { + byte[] bytes = "hash-cache-test".getBytes(); + + Binary[] constants = { + Binary.fromConstantByteArray(bytes), + Binary.fromConstantByteArray(bytes, 0, bytes.length), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(bytes)), + }; + int reference = constants[0].hashCode(); + for (Binary b : constants) { + int first = b.hashCode(); + int second = b.hashCode(); + assertEquals("repeated hashCode for " + b.getClass().getSimpleName(), first, second); + assertEquals( + "cross-impl hashCode for " + b.getClass().getSimpleName() + " must equal reference", + reference, + first); + } + } + + /** + * Verifies that reused (mutable backing) Binary instances do not return a stale cached + * hash code when their backing bytes change between calls. + */ + @Test + public void testHashCodeNotCachedForReusedBinary() { + byte[] bytes = "first".getBytes(); + Binary reused = Binary.fromReusedByteArray(bytes); + int firstHash = reused.hashCode(); + int constHashFirst = Binary.fromConstantByteArray(bytes).hashCode(); + assertEquals(constHashFirst, firstHash); + + byte[] mutated = "second-value".getBytes(); + reused = Binary.fromReusedByteArray(mutated); + int secondHash = reused.hashCode(); + int constHashSecond = Binary.fromConstantByteArray(mutated).hashCode(); + assertEquals(constHashSecond, secondHash); + } + @Test public void testWriteAllTo() throws Exception { byte[] orig = {10, 9, 8, 7, 6, 5, 4, 3, 2, 1};