Skip to content

Commit

Permalink
fix ci
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewgazelka committed Oct 23, 2024
1 parent 10d7438 commit 8654d09
Show file tree
Hide file tree
Showing 11 changed files with 81 additions and 55 deletions.
4 changes: 4 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ bytes = "1.6.0"
chrono = "0.4.38"
chrono-tz = "0.8.4"
comfy-table = "7.1.1"
common-error = {path = "src/common/error", default-features = false}
daft-hash = {path = "src/daft-hash"}
derivative = "2.2.0"
divan = "0.1.14"
Expand Down
4 changes: 2 additions & 2 deletions src/daft-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ common-display = {path = "../common/display", default-features = false}
common-error = {path = "../common/error", default-features = false}
common-hashable-float-wrapper = {path = "../common/hashable-float-wrapper"}
common-py-serde = {path = "../common/py-serde", default-features = false}
daft-hash = {workspace = true}
daft-hash = {workspace = true, features = ["python"]}
daft-minhash = {path = "../daft-minhash", default-features = false}
daft-schema = {path = "../daft-schema", default-features = false}
daft-sketch = {path = "../daft-sketch", default-features = false}
Expand All @@ -51,7 +51,7 @@ optional = true
version = "0.21.0"

[dependencies.xxhash-rust]
features = ["xxh3", "const_xxh3"]
features = ["xxh3", "const_xxh3", "xxh64"]
version = "0.8.5"

[features]
Expand Down
39 changes: 25 additions & 14 deletions src/daft-core/src/python/series.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
use std::ops::{Add, Div, Mul, Rem, Sub};
use std::{
hash::BuildHasherDefault,
ops::{Add, Div, Mul, Rem, Sub},
};

use common_arrow_ffi as ffi;
use daft_hash::MurBuildHasher;
use daft_hash::{HashFunctionKind, MurBuildHasher, Sha1Hasher};
use daft_schema::python::PyDataType;
use pyo3::{
exceptions::PyValueError,
Expand All @@ -10,8 +13,6 @@ use pyo3::{
types::{PyBytes, PyList},
};

fn x(x: HashFunctionKind) -> PyResult<HashFunctionKind> {}

use crate::{
array::{
ops::{
Expand Down Expand Up @@ -335,17 +336,27 @@ impl PySeries {
"ngram_size must be positive: {ngram_size}"
)));
}
let cast_seed = seed as u32;
let seed = seed as u32;

Ok(self
.series
.minhash(
num_hashes as usize,
ngram_size as usize,
cast_seed,
&MurBuildHasher::new(cast_seed),
)?
.into())
let num_hashes = num_hashes as usize;
let ngram_size = ngram_size as usize;

let result = match hash_function {
HashFunctionKind::MurmurHash3 => {
let hasher = MurBuildHasher::new(seed);
self.series.minhash(num_hashes, ngram_size, seed, &hasher)
}
HashFunctionKind::XxHash => {
let hasher = xxhash_rust::xxh64::Xxh64Builder::new(seed as u64);
self.series.minhash(num_hashes, ngram_size, seed, &hasher)
}
HashFunctionKind::Sha1 => {
let hasher = BuildHasherDefault::<Sha1Hasher>::default();
self.series.minhash(num_hashes, ngram_size, seed, &hasher)
}
}?;

Ok(result.into())
}

pub fn __richcmp__(&self, other: &Self, op: CompareOp) -> PyResult<Self> {
Expand Down
2 changes: 1 addition & 1 deletion src/daft-functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ common-hashable-float-wrapper = {path = "../common/hashable-float-wrapper"}
common-io-config = {path = "../common/io-config", default-features = false}
daft-core = {path = "../daft-core", default-features = false}
daft-dsl = {path = "../daft-dsl", default-features = false}
daft-hash = {workspace = true}
daft-hash = {workspace = true, features = ["python"]}
daft-image = {path = "../daft-image", default-features = false}
daft-io = {path = "../daft-io", default-features = false}
futures = {workspace = true}
Expand Down
37 changes: 3 additions & 34 deletions src/daft-functions/src/minhash.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
use std::{
hash::{BuildHasher, BuildHasherDefault},
str::FromStr,
};
use std::hash::BuildHasherDefault;

use common_error::{DaftError, DaftResult};
use daft_core::prelude::*;
use daft_dsl::{
functions::{ScalarFunction, ScalarUDF},
ExprRef,
};
use daft_hash::{MurBuildHasher, Sha1Hasher};
#[cfg(feature = "python")]
use pyo3::pyclass;
use daft_hash::{HashFunctionKind, MurBuildHasher, Sha1Hasher};
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
Expand Down Expand Up @@ -98,38 +93,12 @@ pub fn minhash(
.into()
}

/// Identifies which hash function to use.
///
/// Values are parsed from the strings `"murmur3"`, `"xxhash"` and `"sha1"`
/// by this type's `FromStr` implementation. When the `python` feature is
/// enabled, the enum is additionally exposed as a pyclass in the `daft.daft`
/// module.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Copy)]
#[cfg_attr(feature = "python", pyclass(module = "daft.daft"))]
pub enum HashFunctionKind {
/// MurmurHash3; parsed from `"murmur3"`.
MurmurHash3,
/// xxHash; parsed from `"xxhash"`.
XxHash,
/// SHA-1; parsed from `"sha1"`.
Sha1,
}

impl FromStr for HashFunctionKind {
    type Err = DaftError;

    /// Parses a hash function name.
    ///
    /// Recognised names are `"murmur3"`, `"xxhash"` and `"sha1"` (exact,
    /// case-sensitive matches).
    ///
    /// # Errors
    ///
    /// Returns `DaftError::ValueError` for any other input.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let kind = match s {
            "murmur3" => Self::MurmurHash3,
            "xxhash" => Self::XxHash,
            "sha1" => Self::Sha1,
            unknown => {
                // Anything else is reported back to the caller verbatim.
                let message = format!("Hash function {} not found", unknown);
                return Err(DaftError::ValueError(message));
            }
        };
        Ok(kind)
    }
}

#[cfg(feature = "python")]
pub mod python {
use daft_dsl::python::PyExpr;
use daft_hash::HashFunctionKind;
use pyo3::{exceptions::PyValueError, pyfunction, PyResult};

use crate::minhash::HashFunctionKind;

#[pyfunction]
pub fn minhash(
expr: PyExpr,
Expand Down
7 changes: 7 additions & 0 deletions src/daft-hash/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
[dependencies]
common-error = {workspace = true}
mur3 = {workspace = true}
pyo3 = {workspace = true, optional = true} # For Python bindings
serde = {workspace = true, features = ["derive"]}
sha1 = {workspace = true}

[features]
default = []
python = ["dep:pyo3"] # Enable pyo3 when python feature is enabled

[lints]
workspace = true

Expand Down
34 changes: 33 additions & 1 deletion src/daft-hash/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
#![feature(split_array)]

use std::hash::{BuildHasher, Hasher};
use std::{
hash::{BuildHasher, Hasher},
str::FromStr,
};

use common_error::DaftError;
#[cfg(feature = "python")]
use pyo3::prelude::*;
use serde::{Deserialize, Serialize};
use sha1::Digest;

pub struct MurBuildHasher {
Expand Down Expand Up @@ -44,3 +51,28 @@ impl Hasher for Sha1Hasher {
self.state.update(bytes);
}
}

/// Identifies which hash function to use.
///
/// Values are parsed from the strings `"murmur3"`, `"xxhash"` and `"sha1"`
/// by this type's `FromStr` implementation. When the `python` feature is
/// enabled, the enum is additionally exposed as a pyclass in the `daft.daft`
/// module.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Copy)]
#[cfg_attr(feature = "python", pyclass(module = "daft.daft"))]
pub enum HashFunctionKind {
/// MurmurHash3; parsed from `"murmur3"`.
MurmurHash3,
/// xxHash; parsed from `"xxhash"`.
XxHash,
/// SHA-1; parsed from `"sha1"`.
Sha1,
}

impl FromStr for HashFunctionKind {
    type Err = DaftError;

    /// Parses a hash function name.
    ///
    /// Recognised names are `"murmur3"`, `"xxhash"` and `"sha1"` (exact,
    /// case-sensitive matches).
    ///
    /// # Errors
    ///
    /// Returns `DaftError::ValueError` for any other input.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let kind = match s {
            "murmur3" => Self::MurmurHash3,
            "xxhash" => Self::XxHash,
            "sha1" => Self::Sha1,
            unknown => {
                // Anything else is reported back to the caller verbatim.
                let message = format!("Hash function {} not found", unknown);
                return Err(DaftError::ValueError(message));
            }
        };
        Ok(kind)
    }
}
3 changes: 2 additions & 1 deletion src/daft-minhash/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ harness = false
name = "minhash"

[dependencies]
common-error = {path = "../common/error", default-features = false}
fastrand = "2.1.0"
mur3 = "0.1.0"
common-error.workspace = true

[dev-dependencies]
xxhash-rust = {workspace = true, features = ["xxh64", "xxh3"]}
ahash.workspace = true
daft-hash.workspace = true
divan.workspace = true
rustc-hash.workspace = true

Expand Down
3 changes: 2 additions & 1 deletion src/daft-minhash/benches/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ use std::{
};

use ahash::AHasher;
use daft_minhash::{load_simd, minhash, MurBuildHasher};
use daft_hash::MurBuildHasher;
use daft_minhash::{load_simd, minhash};
use divan::{black_box, Bencher};
use rustc_hash::FxHasher;

Expand Down
2 changes: 1 addition & 1 deletion src/daft-sql/src/modules/hashing.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use daft_dsl::ExprRef;
use daft_functions::{
hash::hash,
minhash::{minhash, HashFunctionKind, MinHashFunction},
minhash::{minhash, MinHashFunction},
};
use sqlparser::ast::FunctionArg;

Expand Down

0 comments on commit 8654d09

Please sign in to comment.