diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py index 12b7e382a4..1fae13ab47 100644 --- a/daft/expressions/expressions.py +++ b/daft/expressions/expressions.py @@ -1202,7 +1202,7 @@ def minhash( num_hashes: The number of hash permutations to compute. ngram_size: The number of tokens in each shingle/ngram. seed (optional): Seed used for generating permutations and the initial string hashes. Defaults to 1. - hash_function (optional): Hash function to use for initial string hashing. One of "murmur3", "xxhash", or "sha1". Defaults to "murmur3". + hash_function (optional): Hash function to use for initial string hashing. One of "murmurhash3", "xxhash", or "sha1". Defaults to "murmurhash3". """ assert isinstance(num_hashes, int) diff --git a/src/daft-sql/src/modules/hashing.rs b/src/daft-sql/src/modules/hashing.rs index 6a3839296b..f83a297d37 100644 --- a/src/daft-sql/src/modules/hashing.rs +++ b/src/daft-sql/src/modules/hashing.rs @@ -95,7 +95,7 @@ impl TryFrom for MinHashFunction { }) }) .transpose()? - .unwrap_or("murmur3"); + .unwrap_or("murmurhash3"); Ok(Self { num_hashes, diff --git a/tests/series/test_minhash.py b/tests/series/test_minhash.py index 8ddeff2662..499dec8e55 100644 --- a/tests/series/test_minhash.py +++ b/tests/series/test_minhash.py @@ -41,7 +41,7 @@ def minhash_none( @pytest.mark.parametrize("num_hashes", [1, 2, 16, 128]) @pytest.mark.parametrize("ngram_size", [1, 2, 4, 5, 100]) @pytest.mark.parametrize("seed", [1, -1, 123, None]) -@pytest.mark.parametrize("hash_function", ["murmur3", "xxhash", "sha1"]) +@pytest.mark.parametrize("hash_function", ["murmurhash3", "xxhash", "sha1"]) def test_minhash(num_hashes, ngram_size, seed, hash_function): minhash = minhash_none(test_series, num_hashes, ngram_size, seed, hash_function) assert minhash[4] is None and minhash[-1] is None