Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 43 additions & 4 deletions parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,18 @@ public abstract class Binary implements Comparable<Binary>, Serializable {

protected boolean isBackingBytesReused;

/**
* Cached hash code for non-reused (immutable) Binary instances.
* <p>The sentinel value {@code 0} means "not yet computed". This follows the
* {@link String#hashCode()} idiom: races between concurrent first calls are benign
* because the computation is deterministic, and a hash that genuinely equals {@code 0}
* will simply be recomputed on every call (acceptably rare). Reused instances never
* cache (their backing bytes can mutate after construction).
* <p>Package-private (rather than private) so subclasses can read it directly without
* an extra method call on the {@link #hashCode()} hot path.
*/
transient int cachedHashCode;

// this isn't really something others should extend
private Binary() {}

Expand Down Expand Up @@ -101,6 +113,18 @@ public boolean equals(Object obj) {
return false;
}

/**
* Caches {@code hashCode} for non-reused instances and returns it. The cache uses
* a single int field with sentinel {@code 0} to remain race-safe without volatile.
* If the computed hash is {@code 0}, no caching occurs and the next call recomputes.
*/
final int cacheHashCode(int hashCode) {
if (!isBackingBytesReused) {
cachedHashCode = hashCode;
}
return hashCode;
}

@Override
public String toString() {
return "Binary{" + length()
Expand Down Expand Up @@ -180,7 +204,11 @@ public Binary slice(int start, int length) {

@Override
public int hashCode() {
return Binary.hashCode(value, offset, length);
int h = cachedHashCode;
if (h != 0) {
return h;
}
return cacheHashCode(Binary.hashCode(value, offset, length));
}

@Override
Expand Down Expand Up @@ -340,7 +368,11 @@ public Binary slice(int start, int length) {

@Override
public int hashCode() {
return Binary.hashCode(value, 0, value.length);
int h = cachedHashCode;
if (h != 0) {
return h;
}
return cacheHashCode(Binary.hashCode(value, 0, value.length));
}

@Override
Expand Down Expand Up @@ -499,11 +531,18 @@ public Binary slice(int start, int length) {

@Override
public int hashCode() {
int h = cachedHashCode;
if (h != 0) {
return h;
}

int computedHashCode;
if (value.hasArray()) {
return Binary.hashCode(value.array(), value.arrayOffset() + offset, length);
computedHashCode = Binary.hashCode(value.array(), value.arrayOffset() + offset, length);
} else {
return Binary.hashCode(value, offset, length);
computedHashCode = Binary.hashCode(value, offset, length);
}
return cacheHashCode(computedHashCode);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,51 @@ public void testEqualityMethods() throws Exception {
assertEquals(bin1, bin2);
}

/**
* Verifies that {@link Binary#hashCode()} is cached for non-reused (constant) instances:
* the value returned must be stable, equal across the three concrete Binary
* implementations for the same bytes, and consistent with {@link Object#equals(Object)}.
*/
@Test
public void testHashCodeCachedForConstantBinary() {
byte[] bytes = "hash-cache-test".getBytes();

Binary[] constants = {
Binary.fromConstantByteArray(bytes),
Binary.fromConstantByteArray(bytes, 0, bytes.length),
Binary.fromConstantByteBuffer(ByteBuffer.wrap(bytes)),
};
int reference = constants[0].hashCode();
for (Binary b : constants) {
int first = b.hashCode();
int second = b.hashCode();
assertEquals("repeated hashCode for " + b.getClass().getSimpleName(), first, second);
assertEquals(
"cross-impl hashCode for " + b.getClass().getSimpleName() + " must equal reference",
reference,
first);
}
}

/**
* Verifies that reused (mutable backing) Binary instances do not return a stale cached
* hash code when their backing bytes change between calls.
*/
@Test
public void testHashCodeNotCachedForReusedBinary() {
byte[] bytes = "first".getBytes();
Binary reused = Binary.fromReusedByteArray(bytes);
int firstHash = reused.hashCode();
int constHashFirst = Binary.fromConstantByteArray(bytes).hashCode();
assertEquals(constHashFirst, firstHash);

byte[] mutated = "second-value".getBytes();
reused = Binary.fromReusedByteArray(mutated);
int secondHash = reused.hashCode();
int constHashSecond = Binary.fromConstantByteArray(mutated).hashCode();
assertEquals(constHashSecond, secondHash);
}

@Test
public void testWriteAllTo() throws Exception {
byte[] orig = {10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
Expand Down