From 07d2f10198efbbbddfd1c21b7796b22caf73cc87 Mon Sep 17 00:00:00 2001 From: David Mollitor Date: Sun, 29 Dec 2024 00:03:27 -0500 Subject: [PATCH] AVRO-4060: Use JDK to Hash Byte Array in UTF8 --- .../main/java/org/apache/avro/util/Utf8.java | 17 +++++++++++++--- .../java/org/apache/avro/util/TestUtf8.java | 20 +++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java index 22c21c76be5..b609e166c7d 100644 --- a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java +++ b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java @@ -68,6 +68,11 @@ public Utf8(byte[] bytes) { this.length = length; } + Utf8(String string, int length) { + this(string); + this.length = length; + } + /** * Return UTF-8 encoded bytes. Only valid through {@link #getByteLength()} * assuming the bytes have been fully copied into the underlying buffer from the @@ -173,9 +178,15 @@ public int hashCode() { if (h == 0) { byte[] bytes = this.bytes; int length = this.length; - h = 1; - for (int i = 0; i < length; i++) { - h = h * 31 + bytes[i]; + // If the array is filled, use the underlying JDK hash functionality. + // Starting with JDK 21, the underlying implementation is vectorized. 
+ if (length > 7 && bytes.length == length) { + h = Arrays.hashCode(bytes); + } else { + h = 1; + for (int i = 0; i < length; i++) { + h = h * 31 + bytes[i]; + } } this.hash = h; } diff --git a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java index 91618ca5efc..90b77a8da04 100644 --- a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java +++ b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java @@ -99,6 +99,26 @@ void hashCodeReused() { assertEquals(4122302, u.hashCode()); } + /** + * There are two different code paths that hashCode() can take depending on the + * state of the internal buffer. If the buffer is full (string length == buffer + * length) then the JDK hashCode function can be used. This function is + * vectorized in JDK 21+ and is therefore preferable. However, if the buffer + * is not full (string length < buffer length), then the JDK does not support + * this and a scalar implementation is the only option as of today. This + * difference can be resolved with JDK 23 as it supports both cases. + */ + @Test + void hashCodeBasedOnCapacity() { + // string length = 8; buffer length = 8 + Utf8 fullCapacity = new Utf8("abcdefgh", 8); + + // string length = 8; buffer length = 9 + Utf8 partialCapacity = new Utf8("abcdefghX", 8); + + assertEquals(fullCapacity.hashCode(), partialCapacity.hashCode()); + } + @Test void oversizeUtf8() { Utf8 u = new Utf8();