Skip to content

Commit

Permalink
AVRO-4060: Use JDK to Hash Byte Array in UTF8
Browse files Browse the repository at this point in the history
  • Loading branch information
belugabehr committed Dec 29, 2024
1 parent 34ce192 commit 07d2f10
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 3 deletions.
17 changes: 14 additions & 3 deletions lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ public Utf8(byte[] bytes) {
this.length = length;
}

/**
 * Package-private constructor that first delegates to {@link #Utf8(String)}
 * and then overwrites the {@code length} field with the supplied value. No
 * validation of {@code length} is performed here.
 * NOTE(review): this looks like a test hook for simulating a backing buffer
 * that is only partially filled -- confirm before using it elsewhere.
 *
 * @param string the text handed to the {@code Utf8(String)} constructor
 * @param length the value assigned to the {@code length} field
 */
Utf8(String string, int length) {
this(string);
// Replace whatever length the delegated constructor recorded.
this.length = length;
}

/**
* Return UTF-8 encoded bytes. Only valid through {@link #getByteLength()}
* assuming the bytes have been fully copied into the underlying buffer from the
Expand Down Expand Up @@ -173,9 +178,15 @@ public int hashCode() {
if (h == 0) {
byte[] bytes = this.bytes;
int length = this.length;
h = 1;
for (int i = 0; i < length; i++) {
h = h * 31 + bytes[i];
// If the array is filled, use the underlying JDK hash functionality.
// Starting with JDK 21, the underlying implementation is vectorized.
if (length > 7 && bytes.length == length) {
h = Arrays.hashCode(bytes);
} else {
h = 1;
for (int i = 0; i < length; i++) {
h = h * 31 + bytes[i];
}
}
this.hash = h;
}
Expand Down
20 changes: 20 additions & 0 deletions lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,26 @@ void hashCodeReused() {
assertEquals(4122302, u.hashCode());
}

/**
 * There are two different code paths that hashCode() can take depending on the
 * state of the internal buffer. If the buffer is full (string length equal to
 * buffer length) then the JDK hashcode function can be used. That function is
 * vectorized in JDK 21+ and is therefore preferable. However, if the buffer is
 * not full (string length less than buffer length), then the JDK does not
 * support this and a scalar implementation is the only option as of today. This
 * difference can be resolved with JDK 23 as it supports both cases.
 * <p>
 * This test checks that both paths yield the same hash for the same leading
 * bytes.
 */
@Test
void hashCodeBasedOnCapacity() {
// string = 8; buffer = 8 (length 8 satisfies the "length > 7" JDK-path check)
Utf8 fullCapacity = new Utf8("abcdefgh", 8);

// string = 8; buffer = 9 (the trailing 'X' lies beyond the forced length)
Utf8 partialCapacity = new Utf8("abcdefghX", 8);

assertEquals(fullCapacity.hashCode(), partialCapacity.hashCode());
}

@Test
void oversizeUtf8() {
Utf8 u = new Utf8();
Expand Down

0 comments on commit 07d2f10

Please sign in to comment.