From 8654d09beb302bbf1da1f2bf132d9b7808d4ca81 Mon Sep 17 00:00:00 2001 From: Andrew Gazelka Date: Tue, 22 Oct 2024 16:18:49 -0700 Subject: [PATCH] fix ci --- Cargo.lock | 4 +++ Cargo.toml | 1 + src/daft-core/Cargo.toml | 4 +-- src/daft-core/src/python/series.rs | 39 ++++++++++++++++++----------- src/daft-functions/Cargo.toml | 2 +- src/daft-functions/src/minhash.rs | 37 +++------------------------ src/daft-hash/Cargo.toml | 7 ++++++ src/daft-hash/src/lib.rs | 34 ++++++++++++++++++++++++- src/daft-minhash/Cargo.toml | 3 ++- src/daft-minhash/benches/minhash.rs | 3 ++- src/daft-sql/src/modules/hashing.rs | 2 +- 11 files changed, 81 insertions(+), 55 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e61a6ac98..c6d5c8f43a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1924,7 +1924,10 @@ dependencies = [ name = "daft-hash" version = "0.3.0-dev0" dependencies = [ + "common-error", "mur3", + "pyo3", + "serde", "sha1 0.11.0-pre.4", ] @@ -2080,6 +2083,7 @@ version = "0.3.0-dev0" dependencies = [ "ahash", "common-error", + "daft-hash", "divan", "fastrand 2.1.0", "mur3", diff --git a/Cargo.toml b/Cargo.toml index 99bb422fc6..1195d7373a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -155,6 +155,7 @@ bytes = "1.6.0" chrono = "0.4.38" chrono-tz = "0.8.4" comfy-table = "7.1.1" +common-error = {path = "src/common/error", default-features = false} daft-hash = {path = "src/daft-hash"} derivative = "2.2.0" divan = "0.1.14" diff --git a/src/daft-core/Cargo.toml b/src/daft-core/Cargo.toml index 92a3e10de3..7648c4a011 100644 --- a/src/daft-core/Cargo.toml +++ b/src/daft-core/Cargo.toml @@ -24,7 +24,7 @@ common-display = {path = "../common/display", default-features = false} common-error = {path = "../common/error", default-features = false} common-hashable-float-wrapper = {path = "../common/hashable-float-wrapper"} common-py-serde = {path = "../common/py-serde", default-features = false} -daft-hash = {workspace = true} +daft-hash = {workspace = true, features = ["python"]} daft-minhash = {path = "../daft-minhash", default-features = false} daft-schema = {path = "../daft-schema", default-features = false} daft-sketch = {path = "../daft-sketch", default-features = false} @@ -51,7 +51,7 @@ optional = true version = "0.21.0" [dependencies.xxhash-rust] -features = ["xxh3", "const_xxh3"] +features = ["xxh3", "const_xxh3", "xxh64"] version = "0.8.5" [features] diff --git a/src/daft-core/src/python/series.rs b/src/daft-core/src/python/series.rs index 1c7a763193..03e105e809 100644 --- a/src/daft-core/src/python/series.rs +++ b/src/daft-core/src/python/series.rs @@ -1,7 +1,10 @@ -use std::ops::{Add, Div, Mul, Rem, Sub}; +use std::{ + hash::BuildHasherDefault, + ops::{Add, Div, Mul, Rem, Sub}, +}; use common_arrow_ffi as ffi; -use daft_hash::MurBuildHasher; +use daft_hash::{HashFunctionKind, MurBuildHasher, Sha1Hasher}; use daft_schema::python::PyDataType; use pyo3::{ exceptions::PyValueError, @@ -10,8 +13,6 @@ use pyo3::{ types::{PyBytes, PyList}, }; -fn x(x: HashFunctionKind) -> PyResult {} - use crate::{ array::{ ops::{ @@ -335,17 +336,27 @@ impl PySeries { "ngram_size must be positive: {ngram_size}" ))); } - let cast_seed = seed as u32; + let seed = seed as u32; - Ok(self - .series - .minhash( - num_hashes as usize, - ngram_size as usize, - cast_seed, - &MurBuildHasher::new(cast_seed), - )? - .into()) + let num_hashes = num_hashes as usize; + let ngram_size = ngram_size as usize; + + let result = match hash_function { + HashFunctionKind::MurmurHash3 => { + let hasher = MurBuildHasher::new(seed); + self.series.minhash(num_hashes, ngram_size, seed, &hasher) + } + HashFunctionKind::XxHash => { + let hasher = xxhash_rust::xxh64::Xxh64Builder::new(seed as u64); + self.series.minhash(num_hashes, ngram_size, seed, &hasher) + } + HashFunctionKind::Sha1 => { + let hasher = BuildHasherDefault::::default(); + self.series.minhash(num_hashes, ngram_size, seed, &hasher) + } + }?; + + Ok(result.into()) } pub fn __richcmp__(&self, other: &Self, op: CompareOp) -> PyResult { diff --git a/src/daft-functions/Cargo.toml b/src/daft-functions/Cargo.toml index 81715342b7..c1a36393e7 100644 --- a/src/daft-functions/Cargo.toml +++ b/src/daft-functions/Cargo.toml @@ -6,7 +6,7 @@ common-hashable-float-wrapper = {path = "../common/hashable-float-wrapper"} common-io-config = {path = "../common/io-config", default-features = false} daft-core = {path = "../daft-core", default-features = false} daft-dsl = {path = "../daft-dsl", default-features = false} -daft-hash = {workspace = true} +daft-hash = {workspace = true, features = ["python"]} daft-image = {path = "../daft-image", default-features = false} daft-io = {path = "../daft-io", default-features = false} futures = {workspace = true} diff --git a/src/daft-functions/src/minhash.rs b/src/daft-functions/src/minhash.rs index 2a3bf2d95c..c0bcb0607c 100644 --- a/src/daft-functions/src/minhash.rs +++ b/src/daft-functions/src/minhash.rs @@ -1,7 +1,4 @@ -use std::{ - hash::{BuildHasher, BuildHasherDefault}, - str::FromStr, -}; +use std::hash::BuildHasherDefault; use common_error::{DaftError, DaftResult}; use daft_core::prelude::*; @@ -9,9 +6,7 @@ use daft_dsl::{ functions::{ScalarFunction, ScalarUDF}, ExprRef, }; -use daft_hash::{MurBuildHasher, Sha1Hasher}; -#[cfg(feature = "python")] -use pyo3::pyclass; +use daft_hash::{HashFunctionKind, MurBuildHasher, Sha1Hasher}; use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] @@ -98,38 +93,12 @@ pub fn minhash( .into() } -/// Format of a file, e.g. Parquet, CSV, JSON. -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Copy)] -#[cfg_attr(feature = "python", pyclass(module = "daft.daft"))] -pub enum HashFunctionKind { - MurmurHash3, - XxHash, - Sha1, -} - -impl FromStr for HashFunctionKind { - type Err = DaftError; - - fn from_str(s: &str) -> Result { - match s { - "murmur3" => Ok(Self::MurmurHash3), - "xxhash" => Ok(Self::XxHash), - "sha1" => Ok(Self::Sha1), - _ => Err(DaftError::ValueError(format!( - "Hash function {} not found", - s - ))), - } - } -} - #[cfg(feature = "python")] pub mod python { use daft_dsl::python::PyExpr; + use daft_hash::HashFunctionKind; use pyo3::{exceptions::PyValueError, pyfunction, PyResult}; - use crate::minhash::HashFunctionKind; - #[pyfunction] pub fn minhash( expr: PyExpr, diff --git a/src/daft-hash/Cargo.toml b/src/daft-hash/Cargo.toml index 5f52a410f6..3af141c7ab 100644 --- a/src/daft-hash/Cargo.toml +++ b/src/daft-hash/Cargo.toml @@ -1,7 +1,14 @@ [dependencies] +common-error = {workspace = true} mur3 = {workspace = true} +pyo3 = {workspace = true, optional = true} # For Python bindings +serde = {workspace = true, features = ["derive"]} sha1 = {workspace = true} +[features] +default = [] +python = ["dep:pyo3"] # Enable pyo3 when python feature is enabled + [lints] workspace = true diff --git a/src/daft-hash/src/lib.rs b/src/daft-hash/src/lib.rs index e0d08beb4a..fb6bcb61e4 100644 --- a/src/daft-hash/src/lib.rs +++ b/src/daft-hash/src/lib.rs @@ -1,7 +1,14 @@ #![feature(split_array)] -use std::hash::{BuildHasher, Hasher}; +use std::{ + hash::{BuildHasher, Hasher}, + str::FromStr, +}; +use common_error::DaftError; +#[cfg(feature = "python")] +use pyo3::prelude::*; +use serde::{Deserialize, Serialize}; use sha1::Digest; pub struct MurBuildHasher { @@ -44,3 +51,28 @@ impl Hasher for Sha1Hasher { self.state.update(bytes); } } + +/// Format of a file, e.g. Parquet, CSV, JSON. +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Copy)] +#[cfg_attr(feature = "python", pyclass(module = "daft.daft"))] +pub enum HashFunctionKind { + MurmurHash3, + XxHash, + Sha1, +} + +impl FromStr for HashFunctionKind { + type Err = DaftError; + + fn from_str(s: &str) -> Result { + match s { + "murmur3" => Ok(Self::MurmurHash3), + "xxhash" => Ok(Self::XxHash), + "sha1" => Ok(Self::Sha1), + _ => Err(DaftError::ValueError(format!( + "Hash function {} not found", + s + ))), + } + } +} diff --git a/src/daft-minhash/Cargo.toml b/src/daft-minhash/Cargo.toml index a8c2dad4aa..1971e586c7 100644 --- a/src/daft-minhash/Cargo.toml +++ b/src/daft-minhash/Cargo.toml @@ -3,13 +3,14 @@ harness = false name = "minhash" [dependencies] -common-error = {path = "../common/error", default-features = false} fastrand = "2.1.0" mur3 = "0.1.0" +common-error.workspace = true [dev-dependencies] xxhash-rust = {workspace = true, features = ["xxh64", "xxh3"]} ahash.workspace = true +daft-hash.workspace = true divan.workspace = true rustc-hash.workspace = true diff --git a/src/daft-minhash/benches/minhash.rs b/src/daft-minhash/benches/minhash.rs index 2c01f8de54..5cbb414962 100644 --- a/src/daft-minhash/benches/minhash.rs +++ b/src/daft-minhash/benches/minhash.rs @@ -3,7 +3,8 @@ use std::{ }; use ahash::AHasher; -use daft_minhash::{load_simd, minhash, MurBuildHasher}; +use daft_hash::MurBuildHasher; +use daft_minhash::{load_simd, minhash}; use divan::{black_box, Bencher}; use rustc_hash::FxHasher; diff --git a/src/daft-sql/src/modules/hashing.rs b/src/daft-sql/src/modules/hashing.rs index eab0d64932..6a3839296b 100644 --- a/src/daft-sql/src/modules/hashing.rs +++ b/src/daft-sql/src/modules/hashing.rs @@ -1,7 +1,7 @@ use daft_dsl::ExprRef; use daft_functions::{ hash::hash, - minhash::{minhash, HashFunctionKind, MinHashFunction}, + minhash::{minhash, MinHashFunction}, }; use sqlparser::ast::FunctionArg;