diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index a18b4b28f..079fbc524 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -9,24 +9,24 @@ name = "tokenizers"
 crate-type = ["cdylib"]

 [dependencies]
-rayon = "1.8"
+rayon = "1.10"
 serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
-env_logger = "0.10.0"
-pyo3 = { version = "0.20" }
-numpy = "0.20.0"
+env_logger = "0.11"
+pyo3 = { version = "0.21" }
+numpy = "0.21"
 ndarray = "0.15"
 onig = { version = "6.4", default-features = false }
-itertools = "0.11"
+itertools = "0.12"

 [dependencies.tokenizers]
 version = "0.16.0-dev.0"
 path = "../../tokenizers"

 [dev-dependencies]
-tempfile = "3.8"
-pyo3 = { version = "0.20", features = ["auto-initialize"] }
+tempfile = "3.10"
+pyo3 = { version = "0.21", features = ["auto-initialize"] }

 [features]
 defaut = ["pyo3/extension-module"]
diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index f3d36532a..d436d91f3 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -1,7 +1,6 @@
 use std::sync::{Arc, RwLock};

 use crate::pre_tokenizers::from_string;
-use crate::utils::PyChar;
 use crate::utils::PyPattern;
 use pyo3::exceptions;
 use pyo3::prelude::*;
@@ -318,8 +317,8 @@ impl PyMetaspaceDec {
     }

     #[setter]
-    fn set_replacement(self_: PyRef<Self>, replacement: PyChar) {
-        setter!(self_, Metaspace, @set_replacement, replacement.0);
+    fn set_replacement(self_: PyRef<Self>, replacement: char) {
+        setter!(self_, Metaspace, @set_replacement, replacement);
     }

     #[getter]
@@ -352,16 +351,16 @@ impl PyMetaspaceDec {
     }

     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\", split = True)")]
+    #[pyo3(signature = (replacement = '▁', prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\", split = True)")]
     fn new(
-        replacement: PyChar,
+        replacement: char,
         prepend_scheme: String,
         split: bool,
     ) -> PyResult<(Self, PyDecoder)> {
         let prepend_scheme = from_string(prepend_scheme)?;
         Ok((
             PyMetaspaceDec {},
-            Metaspace::new(replacement.0, prepend_scheme, split).into(),
+            Metaspace::new(replacement, prepend_scheme, split).into(),
         ))
     }
 }
@@ -602,7 +601,7 @@ mod test {
         Python::with_gil(|py| {
             let py_dec = PyDecoder::new(Metaspace::default().into());
             let py_meta = py_dec.get_as_subtype(py).unwrap();
-            assert_eq!("Metaspace", py_meta.as_ref(py).get_type().name().unwrap());
+            assert_eq!("Metaspace", py_meta.as_ref(py).get_type().qualname().unwrap());
         })
     }

diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs
index 8fce02c94..eb2226c19 100644
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@@ -321,14 +321,14 @@ macro_rules! setter {
 }

 #[derive(FromPyObject)]
-enum PyVocab<'a> {
+enum PyVocab {
     Vocab(Vocab),
-    Filename(&'a str),
+    Filename(String),
 }
 #[derive(FromPyObject)]
-enum PyMerges<'a> {
+enum PyMerges {
     Merges(Merges),
-    Filename(&'a str),
+    Filename(String),
 }

 #[pymethods]
@@ -870,7 +870,7 @@ mod test {
         Python::with_gil(|py| {
             let py_model = PyModel::from(BPE::default());
             let py_bpe = py_model.get_as_subtype(py).unwrap();
-            assert_eq!("BPE", py_bpe.as_ref(py).get_type().name().unwrap());
+            assert_eq!("BPE", py_bpe.as_ref(py).get_type().qualname().unwrap());
         })
     }

diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs
index 954ee5aa8..e1203e4a7 100644
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -468,10 +468,10 @@ impl PyPrecompiled {
     #[new]
     #[pyo3(text_signature = "(self, precompiled_charsmap)")]
     fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> {
-        let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?;
+        let precompiled_charsmap: Vec<u8> = FromPyObject::extract(py_precompiled_charsmap)?;
         Ok((
             PyPrecompiled {},
-            Precompiled::from(precompiled_charsmap)
+            Precompiled::from(&precompiled_charsmap)
                 .map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to build Precompiled normalizer: {}",
@@ -667,7 +667,7 @@ mod test {
         Python::with_gil(|py| {
             let py_norm = PyNormalizer::new(NFC.into());
             let py_nfc = py_norm.get_as_subtype(py).unwrap();
-            assert_eq!("NFC", py_nfc.as_ref(py).get_type().name().unwrap());
+            assert_eq!("NFC", py_nfc.as_ref(py).get_type().qualname().unwrap());
         })
     }

diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index 59cc394da..9a21b5edb 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -372,16 +372,16 @@ impl PyCharDelimiterSplit {
     }

     #[setter]
-    fn set_delimiter(self_: PyRef<Self>, delimiter: PyChar) {
-        setter!(self_, Delimiter, delimiter, delimiter.0);
+    fn set_delimiter(self_: PyRef<Self>, delimiter: char) {
+        setter!(self_, Delimiter, delimiter, delimiter);
     }

     #[new]
     #[pyo3(text_signature = None)]
-    pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
+    pub fn new(delimiter: char) -> PyResult<(Self, PyPreTokenizer)> {
         Ok((
             PyCharDelimiterSplit {},
-            CharDelimiterSplit::new(delimiter.0).into(),
+            CharDelimiterSplit::new(delimiter).into(),
         ))
     }

@@ -490,8 +490,8 @@ impl PyMetaspace {
     }

     #[setter]
-    fn set_replacement(self_: PyRef<Self>, replacement: PyChar) {
-        setter!(self_, Metaspace, @set_replacement, replacement.0);
+    fn set_replacement(self_: PyRef<Self>, replacement: char) {
+        setter!(self_, Metaspace, @set_replacement, replacement);
     }

     #[getter]
@@ -524,15 +524,15 @@ impl PyMetaspace {
     }

     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")]
+    #[pyo3(signature = (replacement = '▁', prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")]
     fn new(
-        replacement: PyChar,
+        replacement: char,
         prepend_scheme: String,
         split: bool,
     ) -> PyResult<(Self, PyPreTokenizer)> {
         // Create a new Metaspace instance
         let prepend_scheme = from_string(prepend_scheme)?;
-        let new_instance: Metaspace = Metaspace::new(replacement.0, prepend_scheme, split);
+        let new_instance: Metaspace = Metaspace::new(replacement, prepend_scheme, split);
         Ok((PyMetaspace {}, new_instance.into()))
     }
 }
@@ -754,7 +754,7 @@ mod test {
         Python::with_gil(|py| {
             let py_norm = PyPreTokenizer::new(Whitespace {}.into());
             let py_wsp = py_norm.get_as_subtype(py).unwrap();
-            assert_eq!("Whitespace", py_wsp.as_ref(py).get_type().name().unwrap());
+            assert_eq!("Whitespace", py_wsp.as_ref(py).get_type().qualname().unwrap());
         })
     }

diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 55a69287e..da8a0985c 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -304,7 +304,7 @@ impl FromPyObject<'_> for PyTemplate {
             Ok(Self(
                 s.try_into().map_err(exceptions::PyValueError::new_err)?,
             ))
-        } else if let Ok(s) = ob.extract::<Vec<&str>>() {
+        } else if let Ok(s) = ob.extract::<Vec<String>>() {
             Ok(Self(
                 s.try_into().map_err(exceptions::PyValueError::new_err)?,
             ))
@@ -474,7 +474,7 @@ mod test {
             let py_bert = py_proc.get_as_subtype(py).unwrap();
             assert_eq!(
                 "BertProcessing",
-                py_bert.as_ref(py).get_type().name().unwrap()
+                py_bert.as_ref(py).get_type().qualname().unwrap()
             );
         })
     }
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 4e792ef54..b5bdbd17d 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -377,12 +377,12 @@ impl<'s> FromPyObject<'s> for PreTokenizedInputSequence<'s> {
             return Ok(Self(seq.into()));
         }
         if let Ok(s) = ob.downcast::<PyList>() {
-            if let Ok(seq) = s.extract::<Vec<&str>>() {
+            if let Ok(seq) = s.extract::<Vec<String>>() {
                 return Ok(Self(seq.into()));
             }
         }
         if let Ok(s) = ob.downcast::<PyTuple>() {
-            if let Ok(seq) = s.extract::<Vec<&str>>() {
+            if let Ok(seq) = s.extract::<Vec<String>>() {
                 return Ok(Self(seq.into()));
             }
         }
diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs
index 707dc7230..c2772ef3b 100644
--- a/bindings/python/src/trainers.rs
+++ b/bindings/python/src/trainers.rs
@@ -2,13 +2,13 @@ use std::sync::{Arc, RwLock};

 use crate::models::PyModel;
 use crate::tokenizer::PyAddedToken;
-use crate::utils::PyChar;
 use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;
 use serde::{Deserialize, Serialize};
 use tk::models::TrainerWrapper;
 use tk::Trainer;
+use std::collections::HashSet;
 use tokenizers as tk;

 /// Base class for all trainers
@@ -269,12 +269,12 @@ impl PyBpeTrainer {
     }

     #[setter]
-    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<PyChar>) {
+    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: HashSet<char>) {
         setter!(
             self_,
             BpeTrainer,
             initial_alphabet,
-            alphabet.into_iter().map(|c| c.0).collect()
+            alphabet
         );
     }

@@ -473,12 +473,12 @@ impl PyWordPieceTrainer {
     }

     #[setter]
-    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<PyChar>) {
+    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: HashSet<char>) {
         setter!(
             self_,
             WordPieceTrainer,
             @set_initial_alphabet,
-            alphabet.into_iter().map(|c| c.0).collect()
+            alphabet
         );
     }

@@ -801,12 +801,12 @@ impl PyUnigramTrainer {
     }

     #[setter]
-    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<PyChar>) {
+    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: HashSet<char>) {
         setter!(
             self_,
             UnigramTrainer,
             initial_alphabet,
-            alphabet.into_iter().map(|c| c.0).collect()
+            alphabet
         );
     }

@@ -893,7 +893,7 @@ mod tests {
         Python::with_gil(|py| {
             let py_trainer = PyTrainer::new(Arc::new(RwLock::new(BpeTrainer::default().into())));
             let py_bpe = py_trainer.get_as_subtype(py).unwrap();
-            assert_eq!("BpeTrainer", py_bpe.as_ref(py).get_type().name().unwrap());
+            assert_eq!("BpeTrainer", py_bpe.as_ref(py).get_type().qualname().unwrap());
         })
     }
 }
diff --git a/bindings/python/src/utils/mod.rs b/bindings/python/src/utils/mod.rs
index e7f95f03a..1e409a504 100644
--- a/bindings/python/src/utils/mod.rs
+++ b/bindings/python/src/utils/mod.rs
@@ -1,6 +1,3 @@
-use pyo3::exceptions;
-use pyo3::prelude::*;
-use pyo3::types::*;
 use std::marker::PhantomData;
 use std::sync::{Arc, Mutex};

@@ -14,25 +11,6 @@ pub use normalization::*;
 pub use pretokenization::*;
 pub use regex::*;

-// PyChar
-// This type is a temporary hack to accept `char` as argument
-// To be removed once https://github.com/PyO3/pyo3/pull/1282 has been released
-pub struct PyChar(pub char);
-
-impl FromPyObject<'_> for PyChar {
-    fn extract(obj: &PyAny) -> PyResult<Self> {
-        let s = <PyString as PyTryFrom<'_>>::try_from(obj)?.to_str()?;
-        let mut iter = s.chars();
-        if let (Some(ch), None) = (iter.next(), iter.next()) {
-            Ok(Self(ch))
-        } else {
-            Err(exceptions::PyValueError::new_err(
-                "expected a string of length 1",
-            ))
-        }
-    }
-}
-
 // RefMut utils

 pub trait DestroyPtr {
diff --git a/bindings/python/src/utils/normalization.rs b/bindings/python/src/utils/normalization.rs
index 11a068565..d8cecce14 100644
--- a/bindings/python/src/utils/normalization.rs
+++ b/bindings/python/src/utils/normalization.rs
@@ -9,15 +9,15 @@ use tk::pattern::Pattern;

 /// Represents a Pattern as used by `NormalizedString`
 #[derive(Clone, FromPyObject)]
-pub enum PyPattern<'p> {
+pub enum PyPattern {
     #[pyo3(annotation = "str")]
-    Str(&'p str),
+    Str(String),
     #[pyo3(annotation = "tokenizers.Regex")]
     Regex(Py<PyRegex>),
     // TODO: Add the compatibility for Fn(char) -> bool
 }

-impl Pattern for PyPattern<'_> {
+impl Pattern for PyPattern {
     fn find_matches(&self, inside: &str) -> tk::Result<Vec<(Offsets, bool)>> {
         match self {
             PyPattern::Str(s) => {
@@ -35,8 +35,8 @@ impl Pattern for PyPattern<'_> {
     }
 }

-impl From<PyPattern<'_>> for tk::normalizers::replace::ReplacePattern {
-    fn from(pattern: PyPattern<'_>) -> Self {
+impl From<PyPattern> for tk::normalizers::replace::ReplacePattern {
+    fn from(pattern: PyPattern) -> Self {
         match pattern {
             PyPattern::Str(s) => Self::String(s.to_owned()),
             PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())),
@@ -44,8 +44,8 @@ impl From<PyPattern<'_>> for tk::normalizers::replace::ReplacePattern {
     }
 }

-impl From<PyPattern<'_>> for tk::pre_tokenizers::split::SplitPattern {
-    fn from(pattern: PyPattern<'_>) -> Self {
+impl From<PyPattern> for tk::pre_tokenizers::split::SplitPattern {
+    fn from(pattern: PyPattern) -> Self {
         match pattern {
             PyPattern::Str(s) => Self::String(s.to_owned()),
             PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())),
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index f948fe522..477ada53b 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -40,19 +40,19 @@ harness = false
 lazy_static = "1.4"
 rand = "0.8"
 onig = { version = "6.4", default-features = false, optional = true }
-regex = "1.9"
+regex = "1.10"
 regex-syntax = "0.8"
-rayon = "1.8"
+rayon = "1.10"
 rayon-cond = "0.3"
 serde = { version = "1.0", features = [ "derive" ] }
 serde_json = "1.0"
 unicode-normalization-alignments = "0.1"
 unicode_categories = "0.1"
-unicode-segmentation = "1.10"
+unicode-segmentation = "1.11"
 indicatif = {version = "0.17", optional = true}
 itertools = "0.12"
 log = "0.4"
-derive_builder = "0.13"
+derive_builder = "0.20"
 spm_precompiled = "0.1"
 hf-hub = { version = "0.3.2", optional = true }
 aho-corasick = "1.1"
@@ -62,7 +62,7 @@ thiserror = "1.0.49"
 fancy-regex = { version = "0.13", optional = true}
 getrandom = { version = "0.2.10" }
 esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
-monostate = "0.1.9"
+monostate = "0.1.12"

 [features]
 default = ["progressbar", "onig", "esaxx_fast"]
@@ -73,7 +73,7 @@ unstable_wasm = ["fancy-regex", "getrandom/js"]

 [dev-dependencies]
 criterion = "0.5"
-tempfile = "3.8"
+tempfile = "3.10"
 assert_approx_eq = "1.1"

 [profile.release]
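The `PyChar` wrapper deleted above existed only to accept a one-character Python `str` as a Rust `char`; the pinned pyo3 version provides that conversion natively, so bindings can now take `char` directly. A minimal sketch of the pattern (the function name is hypothetical and not part of this diff), assuming the pyo3 version pinned in `bindings/python/Cargo.toml`:

```rust
use pyo3::prelude::*;

// Sketch only: pyo3's built-in FromPyObject for `char` accepts a Python str
// of exactly one character and rejects anything longer, which is the
// behaviour the removed PyChar wrapper reproduced by hand.
#[pyfunction]
fn replacement_codepoint(replacement: char) -> u32 {
    // e.g. '▁' passed by Metaspace callers arrives here as a Rust char
    replacement as u32
}
```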