diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index b494e4085..7b397ed0e 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -14,10 +14,12 @@ serde = { version = "1.0", features = [ "rc", "derive" ]} serde_json = "1.0" libc = "0.2" env_logger = "0.11" -pyo3 = { version = "0.21" } numpy = "0.21" ndarray = "0.15" itertools = "0.12" +derive_more = "0.99.17" +pyo3 = { version = "0.21", features = ["multiple-pymethods"] } +pyo3_special_method_derive_0_21 = {path = "../../../pyo3-special-method-derive/pyo3_special_method_derive_0_21"} [dependencies.tokenizers] path = "../../tokenizers" diff --git a/bindings/python/grep b/bindings/python/grep new file mode 100644 index 000000000..e69de29bb diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs index ed21f3469..c5d2365e5 100644 --- a/bindings/python/src/decoders.rs +++ b/bindings/python/src/decoders.rs @@ -5,6 +5,7 @@ use crate::utils::PyPattern; use pyo3::exceptions; use pyo3::prelude::*; use pyo3::types::*; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay, Repr, Str}; use serde::de::Error; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use tk::decoders::bpe::BPEDecoder; @@ -28,9 +29,11 @@ use super::error::ToPyResult; /// This class is not supposed to be instantiated directly. Instead, any implementation of /// a Decoder will return an instance of this class when instantiated. #[pyclass(dict, module = "tokenizers.decoders", name = "Decoder", subclass)] -#[derive(Clone, Deserialize, Serialize)] +#[derive(Clone, Deserialize, Serialize, Str, Repr)] +#[format(fmt = "{}")] pub struct PyDecoder { #[serde(flatten)] + #[format] pub(crate) decoder: PyDecoderWrapper, } @@ -478,9 +481,10 @@ impl PySequenceDecoder { } } -#[derive(Clone)] +#[derive(Clone, AutoDisplay, AutoDebug)] pub(crate) struct CustomDecoder { - inner: PyObject, + #[format(skip)] + pub inner: PyObject, } impl CustomDecoder { @@ -531,8 +535,9 @@ impl<'de> Deserialize<'de> for CustomDecoder { } } -#[derive(Clone, Deserialize, Serialize)] +#[derive(Clone, Deserialize, Serialize, AutoDisplay, AutoDebug)] #[serde(untagged)] +#[format(fmt = "{}")] pub(crate) enum PyDecoderWrapper { Custom(Arc>), Wrapped(Arc>), diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs index bffa1bc21..02d838378 100644 --- a/bindings/python/src/models.rs +++ b/bindings/python/src/models.rs @@ -2,11 +2,13 @@ use std::collections::HashMap; use std::path::{Path, PathBuf}; use std::sync::{Arc, RwLock}; +use super::error::{deprecation_warning, ToPyResult}; use crate::token::PyToken; use crate::trainers::PyTrainer; use pyo3::exceptions; use pyo3::prelude::*; use pyo3::types::*; +use pyo3_special_method_derive_0_21::{Repr, Str}; use serde::{Deserialize, Serialize}; use tk::models::bpe::{BpeBuilder, Merges, Vocab, BPE}; use tk::models::unigram::Unigram; @@ -16,8 +18,6 @@ use tk::models::ModelWrapper; use tk::{Model, Token}; use tokenizers as tk; -use super::error::{deprecation_warning, ToPyResult}; - /// Base class for all models /// /// The model represents the actual tokenization algorithm. This is the part that @@ -25,7 +25,8 @@ use super::error::{deprecation_warning, ToPyResult}; /// /// This class cannot be constructed directly. Please use one of the concrete models. 
#[pyclass(module = "tokenizers.models", name = "Model", subclass)] -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize, Str, Repr)] +#[format(fmt = "{}")] pub struct PyModel { #[serde(flatten)] pub model: Arc>, diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs index 864947e39..0d8aa0edd 100644 --- a/bindings/python/src/normalizers.rs +++ b/bindings/python/src/normalizers.rs @@ -1,11 +1,11 @@ use std::sync::{Arc, RwLock}; +use crate::error::ToPyResult; +use crate::utils::{PyNormalizedString, PyNormalizedStringRefMut, PyPattern}; use pyo3::exceptions; use pyo3::prelude::*; use pyo3::types::*; - -use crate::error::ToPyResult; -use crate::utils::{PyNormalizedString, PyNormalizedStringRefMut, PyPattern}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay, Dict, Dir, Repr, Str}; use serde::ser::SerializeStruct; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use tk::normalizers::{ @@ -43,9 +43,11 @@ impl PyNormalizedStringMut<'_> { /// This class is not supposed to be instantiated directly. Instead, any implementation of a /// Normalizer will return an instance of this class when instantiated. #[pyclass(dict, module = "tokenizers.normalizers", name = "Normalizer", subclass)] -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize, Str, Repr, Dir)] +#[format(fmt = "{}")] pub struct PyNormalizer { #[serde(flatten)] + #[format] pub(crate) normalizer: PyNormalizerTypeWrapper, } @@ -477,7 +479,10 @@ impl PyNmt { /// Precompiled normalizer /// Don't use manually it is used for compatiblity for SentencePiece. #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")] +#[derive(Str)] +#[format(fmt = "Precompiled")] pub struct PyPrecompiled {} + #[pymethods] impl PyPrecompiled { #[new] @@ -513,8 +518,9 @@ impl PyReplace { } } -#[derive(Debug, Clone)] +#[derive(AutoDebug, Clone, AutoDisplay)] pub(crate) struct CustomNormalizer { + #[format(fmt = "Custom Normalizer")] inner: PyObject, } impl CustomNormalizer { @@ -556,8 +562,9 @@ impl<'de> Deserialize<'de> for CustomNormalizer { } } -#[derive(Debug, Clone, Deserialize)] +#[derive(AutoDebug, Clone, Deserialize, AutoDisplay)] #[serde(untagged)] +#[format(fmt = "{}")] pub(crate) enum PyNormalizerWrapper { Custom(CustomNormalizer), Wrapped(NormalizerWrapper), @@ -575,8 +582,9 @@ impl Serialize for PyNormalizerWrapper { } } -#[derive(Debug, Clone, Deserialize)] +#[derive(Clone, Deserialize, AutoDisplay, AutoDebug)] #[serde(untagged)] +#[format(fmt = "{}")] pub(crate) enum PyNormalizerTypeWrapper { Sequence(Vec>>), Single(Arc>), diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index a2bd9b39c..43787d053 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -23,7 +23,7 @@ use tokenizers as tk; use super::error::ToPyResult; use super::utils::*; - +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay, Dict, Dir, Repr, Str}; /// Base class for all pre-tokenizers /// /// This class is not supposed to be instantiated directly. 
Instead, any implementation of a @@ -34,10 +34,12 @@ use super::utils::*; name = "PreTokenizer", subclass )] -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize, Str, Repr, Dir, Dict)] +#[format(fmt = "{}")] // don't format the Py wrapper pub struct PyPreTokenizer { #[serde(flatten)] - pub(crate) pretok: PyPreTokenizerTypeWrapper, + #[format] + pretok: PyPreTokenizerTypeWrapper, } impl PyPreTokenizer { @@ -425,6 +427,8 @@ impl PyPunctuation { /// This pre-tokenizer composes other pre_tokenizers and applies them in sequence #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Sequence")] +#[derive(AutoDisplay)] +#[format(fmt = "Sequence.{}")] pub struct PySequence {} #[pymethods] impl PySequence { @@ -587,7 +591,7 @@ impl PyUnicodeScripts { } } -#[derive(Clone)] +#[derive(Clone, AutoDisplay, AutoDebug)] pub(crate) struct CustomPreTokenizer { inner: PyObject, } @@ -631,8 +635,9 @@ impl<'de> Deserialize<'de> for CustomPreTokenizer { } } -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, AutoDisplay, AutoDebug)] #[serde(untagged)] +#[format(fmt = "{}")] pub(crate) enum PyPreTokenizerWrapper { Custom(CustomPreTokenizer), Wrapped(PreTokenizerWrapper), @@ -650,8 +655,9 @@ impl Serialize for PyPreTokenizerWrapper { } } -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, AutoDisplay, AutoDebug)] #[serde(untagged)] +#[format(fmt = "{}")] pub(crate) enum PyPreTokenizerTypeWrapper { Sequence(Vec>>), Single(Arc>), diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs index c46d8ea49..aee5a142b 100644 --- a/bindings/python/src/processors.rs +++ b/bindings/python/src/processors.rs @@ -1,12 +1,12 @@ use std::convert::TryInto; use std::sync::Arc; +use crate::encoding::PyEncoding; +use crate::error::ToPyResult; use pyo3::exceptions; use pyo3::prelude::*; use pyo3::types::*; - -use crate::encoding::PyEncoding; -use crate::error::ToPyResult; +use pyo3_special_method_derive_0_21::{Repr, Str}; use serde::{Deserialize, Serialize}; use tk::processors::bert::BertProcessing; use tk::processors::byte_level::ByteLevel; @@ -27,7 +27,8 @@ use tokenizers as tk; name = "PostProcessor", subclass )] -#[derive(Clone, Deserialize, Serialize)] +#[derive(Clone, Deserialize, Serialize, Str, Repr)] +#[format(fmt = "{}")] pub struct PyPostProcessor { #[serde(flatten)] pub processor: Arc, diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 1c6bc9cc1..68cf2100d 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -1,12 +1,23 @@ use std::collections::{hash_map::DefaultHasher, HashMap}; use std::hash::{Hash, Hasher}; +use super::decoders::PyDecoder; +use super::encoding::PyEncoding; +use super::error::{PyError, ToPyResult}; +use super::models::PyModel; +use super::normalizers::PyNormalizer; +use super::pre_tokenizers::PyPreTokenizer; +use super::trainers::PyTrainer; +use crate::processors::PyPostProcessor; +use crate::utils::{MaybeSizedIterator, PyBufferedIterator}; use numpy::{npyffi, PyArray1}; use pyo3::class::basic::CompareOp; use pyo3::exceptions; use pyo3::intern; use pyo3::prelude::*; use pyo3::types::*; +use pyo3_special_method_derive_0_21::{Repr, Str}; +use std::collections::BTreeMap; use tk::models::bpe::BPE; use tk::tokenizer::{ Model, PaddingDirection, PaddingParams, PaddingStrategy, PostProcessor, TokenizerImpl, @@ -15,17 +26,6 @@ use tk::tokenizer::{ use tk::utils::iter::ResultShunt; use tokenizers as tk; -use super::decoders::PyDecoder; -use 
super::encoding::PyEncoding; -use super::error::{PyError, ToPyResult}; -use super::models::PyModel; -use super::normalizers::PyNormalizer; -use super::pre_tokenizers::PyPreTokenizer; -use super::trainers::PyTrainer; -use crate::processors::PyPostProcessor; -use crate::utils::{MaybeSizedIterator, PyBufferedIterator}; -use std::collections::BTreeMap; - /// Represents a token that can be be added to a :class:`~tokenizers.Tokenizer`. /// It can have special options that defines the way it should behave. /// @@ -462,9 +462,10 @@ type Tokenizer = TokenizerImpl` /// to pure bytes, and attempts to make them into a string. If the tokens /// cannot be decoded you will get � instead for each inconvertable byte token #[non_exhaustive] +#[format(fmt = "ByteFallback")] pub struct ByteFallback { #[serde(rename = "type")] type_: MustBe!("ByteFallback"), diff --git a/tokenizers/src/decoders/ctc.rs b/tokenizers/src/decoders/ctc.rs index 2798638d4..bfabf223b 100644 --- a/tokenizers/src/decoders/ctc.rs +++ b/tokenizers/src/decoders/ctc.rs @@ -1,10 +1,10 @@ use crate::decoders::wordpiece; use crate::tokenizer::{Decoder, Result}; - use itertools::Itertools; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(AutoDebug, Clone, Serialize, Deserialize, AutoDisplay)] /// The CTC (Connectionist Temporal Classification) decoder takes care /// of sanitizing a list of inputs token. /// Due to some alignement problem the output of some models can come diff --git a/tokenizers/src/decoders/fuse.rs b/tokenizers/src/decoders/fuse.rs index 5e4a1c119..43636f8c8 100644 --- a/tokenizers/src/decoders/fuse.rs +++ b/tokenizers/src/decoders/fuse.rs @@ -1,13 +1,14 @@ use crate::tokenizer::{Decoder, Result}; use monostate::MustBe; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; - -#[derive(Clone, Debug, Serialize, Deserialize, Default)] +#[derive(Clone, AutoDebug, Serialize, Deserialize, Default, AutoDisplay)] /// Fuse simply fuses all tokens into one big string. 
/// It's usually the last decoding step anyway, but this /// decoder exists incase some decoders need to happen after that /// step #[non_exhaustive] +#[format(fmt = "Fuse")] pub struct Fuse { #[serde(rename = "type")] type_: MustBe!("Fuse"), diff --git a/tokenizers/src/decoders/mod.rs b/tokenizers/src/decoders/mod.rs index 682e63b50..a0a270536 100644 --- a/tokenizers/src/decoders/mod.rs +++ b/tokenizers/src/decoders/mod.rs @@ -10,8 +10,6 @@ pub mod wordpiece; pub use super::pre_tokenizers::byte_level; pub use super::pre_tokenizers::metaspace; -use serde::{Deserialize, Serialize}; - use crate::decoders::bpe::BPEDecoder; use crate::decoders::byte_fallback::ByteFallback; use crate::decoders::ctc::CTC; @@ -23,8 +21,12 @@ use crate::normalizers::replace::Replace; use crate::pre_tokenizers::byte_level::ByteLevel; use crate::pre_tokenizers::metaspace::Metaspace; use crate::{Decoder, Result}; +use pyo3_special_method_derive_0_21::AutoDebug; +use pyo3_special_method_derive_0_21::AutoDisplay; +use serde::{Deserialize, Serialize}; -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, Clone, AutoDebug, AutoDisplay)] +#[format(fmt = "decoders.{}")] #[serde(untagged)] pub enum DecoderWrapper { BPE(BPEDecoder), diff --git a/tokenizers/src/decoders/sequence.rs b/tokenizers/src/decoders/sequence.rs index 73169b695..4fd57a97e 100644 --- a/tokenizers/src/decoders/sequence.rs +++ b/tokenizers/src/decoders/sequence.rs @@ -1,10 +1,11 @@ use crate::decoders::DecoderWrapper; use crate::tokenizer::{Decoder, Result}; use crate::utils::macro_rules_attribute; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; -#[derive(Clone, Debug)] #[macro_rules_attribute(impl_serde_type!)] +#[derive(Clone, AutoDebug, AutoDisplay)] pub struct Sequence { decoders: Vec, } diff --git a/tokenizers/src/decoders/strip.rs b/tokenizers/src/decoders/strip.rs index b095fc37e..fee40b4e1 100644 --- a/tokenizers/src/decoders/strip.rs +++ b/tokenizers/src/decoders/strip.rs @@ -1,8 +1,8 @@ use crate::tokenizer::{Decoder, Result}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; - -#[derive(Deserialize, Clone, Debug, Serialize, Default)] +#[derive(Deserialize, Clone, AutoDebug, Serialize, Default, AutoDisplay)] /// Strip is a simple trick which converts tokens looking like `<0x61>` /// to pure bytes, and attempts to make them into a string. If the tokens /// cannot be decoded you will get � instead for each inconvertable byte token diff --git a/tokenizers/src/decoders/wordpiece.rs b/tokenizers/src/decoders/wordpiece.rs index 8ecd3987c..f7b3aacde 100644 --- a/tokenizers/src/decoders/wordpiece.rs +++ b/tokenizers/src/decoders/wordpiece.rs @@ -1,8 +1,8 @@ use crate::tokenizer::{Decoder, Result}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; - -#[derive(Deserialize, Clone, Debug, Serialize)] +#[derive(Deserialize, Clone, AutoDebug, Serialize, AutoDisplay)] /// The WordPiece decoder takes care of decoding a list of wordpiece tokens /// back into a readable string. #[serde(tag = "type")] diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs index 657d810ba..acbc47ee5 100644 --- a/tokenizers/src/lib.rs +++ b/tokenizers/src/lib.rs @@ -23,7 +23,7 @@ //! 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant //! that, for example, a language model would need, such as special tokens. //! -//! 
## Loading a pretrained tokenizer from the Hub +//! ## Loading a pretrained tokenizer from the Hub. //! ``` //! use tokenizers::tokenizer::{Result, Tokenizer}; //! @@ -39,7 +39,7 @@ //! } //! ``` //! -//! ## Deserialization and tokenization example +//! ## Deserialization and tokenization example. //! //! ```no_run //! use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput}; diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs index 1585da761..838e16f53 100644 --- a/tokenizers/src/models/bpe/model.rs +++ b/tokenizers/src/models/bpe/model.rs @@ -2,6 +2,7 @@ use super::{super::OrderedVocabIter, trainer::BpeTrainer, Error, Pair, Word}; use crate::tokenizer::{Model, Result, Token}; use crate::utils::cache::{Cache, DEFAULT_CACHE_CAPACITY}; use crate::utils::iter::ResultShunt; +use pyo3_special_method_derive_0_21::{AutoDisplay, PyDebug}; use serde_json::Value; use std::borrow::Cow; use std::{ @@ -204,7 +205,7 @@ impl BpeBuilder { } /// A [Byte Pair Encoding](https://www.aclweb.org/anthology/P16-1162/) model. -#[derive(PartialEq)] +#[derive(PartialEq, AutoDisplay)] pub struct BPE { /// The vocabulary assigns a number to each token. pub(crate) vocab: Vocab, @@ -247,7 +248,62 @@ impl std::fmt::Debug for BPE { .finish() } } - +// impl std::fmt::Display for BPE { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// let mut vocab_vec: Vec<_> = self.vocab.iter().collect(); +// vocab_vec.sort_by_key(|&(_, v)| v); +// vocab_vec.truncate(5); + +// let vocab_str: String = vocab_vec +// .iter() +// .map(|(k, v)| format!("'{}':{}", k, v)) +// .collect::>() +// .join(", "); + +// let mut merges_vec: Vec<_> = self.merges.iter().collect(); +// merges_vec.truncate(5); +// merges_vec.sort_by_key(|&(_, v)| v); + +// let merges_str: String = merges_vec +// .iter() +// .map(|((id1, id2), _)| { +// ( +// self.vocab_r +// .get(id1) +// .cloned() +// .unwrap_or_else(|| id1.to_string()), +// self.vocab_r +// .get(id2) +// .cloned() +// .unwrap_or_else(|| id2.to_string()), +// ) +// }) +// .map(|(id1, id2)| format!("('{}', '{}')", id1, id2)) +// .collect::>() +// .join(", "); + +// write!( +// f, +// "BPE(vocab={{{}, ...}}, merges=[{:?}, ...], dropout={:?}, unk_token={:?}, continuing_subword_prefix={:?}, end_of_word_suffix={:?}, fuse_unk={}, byte_fallback={}, ignore_merges={})", +// vocab_str, +// merges_str, +// self.dropout, +// self.unk_token, +// self.continuing_subword_prefix, +// self.end_of_word_suffix, +// self.fuse_unk, +// self.byte_fallback, +// self.ignore_merges +// ) +// } +// } + +// That is the only annoying part: the explicit implementation. We could have a PyDebugOnly. 
+impl PyDebug for BPE { + fn fmt_debug(&self) -> std::string::String { + format!("{:?}", self) + } +} impl Default for BPE { fn default() -> Self { Self::builder().build().unwrap() diff --git a/tokenizers/src/models/mod.rs b/tokenizers/src/models/mod.rs index bb7cebc4c..68146045f 100644 --- a/tokenizers/src/models/mod.rs +++ b/tokenizers/src/models/mod.rs @@ -8,6 +8,7 @@ pub mod wordpiece; use std::collections::HashMap; use std::path::{Path, PathBuf}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize, Serializer}; use crate::models::bpe::{BpeTrainer, BPE}; @@ -57,8 +58,9 @@ impl<'a> Serialize for OrderedVocabIter<'a> { } } -#[derive(Deserialize, Serialize, Debug, PartialEq, Clone)] +#[derive(Deserialize, Serialize, AutoDebug, PartialEq, Clone, AutoDisplay)] #[serde(untagged)] +#[format(fmt = "models.{}")] // TODO: by default this should define the final render, i.e. `{}, {}, {}` or `{}{}{}` pub enum ModelWrapper { BPE(BPE), // WordPiece must stay before WordLevel here for deserialization (for retrocompatibility diff --git a/tokenizers/src/models/unigram/model.rs b/tokenizers/src/models/unigram/model.rs index defc7d93d..d810a4cfd 100644 --- a/tokenizers/src/models/unigram/model.rs +++ b/tokenizers/src/models/unigram/model.rs @@ -6,15 +6,16 @@ use super::{ use crate::tokenizer::{Model, Result, Token}; use crate::utils::cache::Cache; +use pyo3_special_method_derive_0_21::{AutoDisplay, PyDebug}; use std::collections::HashMap; use std::convert::TryInto; use std::fs::read_to_string; use std::path::{Path, PathBuf}; - type TokenMap = HashMap; type Vocab = Vec<(String, f64)>; /// A `Unigram` model to encode sentences. +#[derive(AutoDisplay)] pub struct Unigram { token_to_ids: TokenMap, pub(crate) vocab: Vocab, @@ -65,7 +66,11 @@ impl std::fmt::Debug for Unigram { .finish() } } - +impl PyDebug for Unigram { + fn fmt_debug(&self) -> std::string::String { + format!("{:?}", self) + } +} static K_UNK_PENALTY: f64 = 10.0; #[derive(thiserror::Error, Debug)] diff --git a/tokenizers/src/models/wordlevel/mod.rs b/tokenizers/src/models/wordlevel/mod.rs index 3482ffee0..2cf9057a2 100644 --- a/tokenizers/src/models/wordlevel/mod.rs +++ b/tokenizers/src/models/wordlevel/mod.rs @@ -1,11 +1,11 @@ use super::OrderedVocabIter; use crate::tokenizer::{Model, Result, Token}; +use pyo3_special_method_derive_0_21::{AutoDisplay, PyDebug}; use serde_json::Value; use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Read, Write}; use std::path::{Path, PathBuf}; - mod serialization; mod trainer; @@ -94,8 +94,9 @@ impl WordLevelBuilder { } } -#[derive(PartialEq, Clone, Eq)] +#[derive(PartialEq, Clone, Eq, AutoDisplay)] pub struct WordLevel { + #[format] vocab: HashMap, vocab_r: HashMap, pub unk_token: String, @@ -109,7 +110,11 @@ impl std::fmt::Debug for WordLevel { .finish() } } - +impl PyDebug for WordLevel { + fn fmt_debug(&self) -> std::string::String { + format!("{:?}", self) + } +} impl WordLevel { pub fn builder() -> WordLevelBuilder { WordLevelBuilder::new() diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs index a75134d2c..a3b2997ce 100644 --- a/tokenizers/src/models/wordpiece/mod.rs +++ b/tokenizers/src/models/wordpiece/mod.rs @@ -3,6 +3,7 @@ use crate::models::bpe::BPE; use crate::tokenizer::{Model, Result, Token}; +use pyo3_special_method_derive_0_21::{AutoDisplay, PyDebug}; use std::{ borrow::Cow, collections::HashMap, @@ -11,7 +12,6 @@ use std::{ io::{BufRead, BufReader}, path::{Path, PathBuf}, }; - mod 
serialization; mod trainer; pub use trainer::*; @@ -119,12 +119,14 @@ impl WordPieceBuilder { /// A /// [WordPiece](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/37842.pdf) /// model. -#[derive(Clone, PartialEq, Eq)] +#[derive(Clone, PartialEq, Eq, AutoDisplay)] pub struct WordPiece { + #[format] vocab: Vocab, vocab_r: VocabR, pub unk_token: String, pub continuing_subword_prefix: String, + #[format(skip)] pub max_input_chars_per_word: usize, } @@ -138,7 +140,11 @@ impl std::fmt::Debug for WordPiece { .finish() } } - +impl PyDebug for WordPiece { + fn fmt_debug(&self) -> std::string::String { + format!("{:?}", self) + } +} impl Default for WordPiece { fn default() -> Self { Self { diff --git a/tokenizers/src/normalizers/bert.rs b/tokenizers/src/normalizers/bert.rs index 90d982c68..9cf9a5e2b 100644 --- a/tokenizers/src/normalizers/bert.rs +++ b/tokenizers/src/normalizers/bert.rs @@ -1,8 +1,8 @@ use crate::tokenizer::{NormalizedString, Normalizer, Result}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; use unicode_categories::UnicodeCategories; - /// Checks whether a character is whitespace fn is_whitespace(c: char) -> bool { // These are technically control characters but we count them as whitespace @@ -47,7 +47,7 @@ fn is_chinese_char(c: char) -> bool { ) } -#[derive(Copy, Clone, Debug, Deserialize, Serialize)] +#[derive(Copy, Clone, AutoDebug, Deserialize, Serialize, AutoDisplay)] #[serde(tag = "type")] #[non_exhaustive] pub struct BertNormalizer { diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs index 42c7fa510..95f38dd96 100644 --- a/tokenizers/src/normalizers/byte_level.rs +++ b/tokenizers/src/normalizers/byte_level.rs @@ -1,9 +1,9 @@ use crate::processors::byte_level::bytes_char; use crate::tokenizer::{NormalizedString, Normalizer, Result}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; - -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, AutoDebug, Deserialize, Serialize, AutoDisplay)] #[serde(tag = "type")] pub struct ByteLevel {} diff --git a/tokenizers/src/normalizers/mod.rs b/tokenizers/src/normalizers/mod.rs index c5144be14..cbd98a07e 100644 --- a/tokenizers/src/normalizers/mod.rs +++ b/tokenizers/src/normalizers/mod.rs @@ -17,10 +17,12 @@ pub use crate::normalizers::utils::{Lowercase, Sequence}; use serde::{Deserialize, Serialize}; use crate::{NormalizedString, Normalizer}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; /// Wrapper for known Normalizers. 
-#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Deserialize, Serialize, AutoDisplay, AutoDebug)] #[serde(untagged)] +#[format(fmt = "normalizers.{}")] pub enum NormalizerWrapper { BertNormalizer(BertNormalizer), StripNormalizer(Strip), @@ -32,6 +34,7 @@ pub enum NormalizerWrapper { Sequence(Sequence), Lowercase(Lowercase), Nmt(Nmt), + #[format(fmt = "Precompiled")] Precompiled(Precompiled), Replace(Replace), Prepend(Prepend), diff --git a/tokenizers/src/normalizers/prepend.rs b/tokenizers/src/normalizers/prepend.rs index 4e318c259..936f9006a 100644 --- a/tokenizers/src/normalizers/prepend.rs +++ b/tokenizers/src/normalizers/prepend.rs @@ -1,7 +1,8 @@ use crate::tokenizer::{NormalizedString, Normalizer, Result}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, AutoDebug, Deserialize, Serialize, AutoDisplay)] #[serde(tag = "type")] pub struct Prepend { pub prepend: String, diff --git a/tokenizers/src/normalizers/replace.rs b/tokenizers/src/normalizers/replace.rs index cdd4a420a..df6d4c005 100644 --- a/tokenizers/src/normalizers/replace.rs +++ b/tokenizers/src/normalizers/replace.rs @@ -2,10 +2,10 @@ use crate::tokenizer::pattern::Pattern; use crate::tokenizer::Decoder; use crate::tokenizer::{NormalizedString, Normalizer, Result}; use crate::utils::SysRegex; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; - /// Represents the different patterns that `Replace` can use -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)] +#[derive(AutoDebug, AutoDisplay, Clone, PartialEq, Serialize, Deserialize, Eq)] pub enum ReplacePattern { String(String), Regex(String), @@ -42,12 +42,14 @@ impl std::convert::TryFrom for Replace { /// This normalizer will take a `pattern` (for now only a String) /// and replace every occurrence with `content`. -#[derive(Debug, Serialize, Deserialize)] +#[derive(AutoDebug, Serialize, Deserialize, AutoDisplay)] #[serde(tag = "type", try_from = "ReplaceDeserializer")] pub struct Replace { pattern: ReplacePattern, + #[format] content: String, #[serde(skip)] + #[format(skip)] regex: SysRegex, } diff --git a/tokenizers/src/normalizers/strip.rs b/tokenizers/src/normalizers/strip.rs index 19f5ff314..ec3b83a68 100644 --- a/tokenizers/src/normalizers/strip.rs +++ b/tokenizers/src/normalizers/strip.rs @@ -1,9 +1,9 @@ use crate::tokenizer::{NormalizedString, Normalizer, Result}; use crate::utils::macro_rules_attribute; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; use unicode_normalization_alignments::char::is_combining_mark; - -#[derive(Copy, Clone, Debug, Deserialize, Serialize)] +#[derive(Copy, Clone, AutoDebug, Deserialize, Serialize, AutoDisplay)] #[serde(tag = "type")] #[non_exhaustive] pub struct Strip { @@ -43,7 +43,7 @@ impl Normalizer for Strip { // This normalizer removes combining marks from a normalized string // It's different from unidecode as it does not attempt to modify // non ascii languages. 
-#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, AutoDebug, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct StripAccents; diff --git a/tokenizers/src/normalizers/unicode.rs b/tokenizers/src/normalizers/unicode.rs index 502b4239b..4a2498722 100644 --- a/tokenizers/src/normalizers/unicode.rs +++ b/tokenizers/src/normalizers/unicode.rs @@ -1,7 +1,8 @@ use crate::tokenizer::{NormalizedString, Normalizer, Result}; use crate::utils::macro_rules_attribute; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; -#[derive(Default, Copy, Clone, Debug)] +#[derive(Default, Copy, Clone, AutoDebug, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct NFD; impl Normalizer for NFD { @@ -11,7 +12,7 @@ impl Normalizer for NFD { } } -#[derive(Default, Copy, Clone, Debug)] +#[derive(Default, Copy, Clone, AutoDebug, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct NFKD; impl Normalizer for NFKD { @@ -21,7 +22,7 @@ impl Normalizer for NFKD { } } -#[derive(Default, Copy, Clone, Debug)] +#[derive(Default, Copy, Clone, AutoDebug, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct NFC; impl Normalizer for NFC { @@ -31,7 +32,7 @@ impl Normalizer for NFC { } } -#[derive(Default, Copy, Clone, Debug)] +#[derive(Default, Copy, Clone, AutoDebug, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct NFKC; impl Normalizer for NFKC { @@ -72,7 +73,7 @@ fn do_nmt(normalized: &mut NormalizedString) { }); } -#[derive(Default, Copy, Clone, Debug)] +#[derive(Default, Copy, Clone, AutoDebug, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct Nmt; impl Normalizer for Nmt { diff --git a/tokenizers/src/normalizers/utils.rs b/tokenizers/src/normalizers/utils.rs index a7730a3f8..77359e608 100644 --- a/tokenizers/src/normalizers/utils.rs +++ b/tokenizers/src/normalizers/utils.rs @@ -3,8 +3,8 @@ use serde::{Deserialize, Serialize}; use crate::normalizers::NormalizerWrapper; use crate::tokenizer::{NormalizedString, Normalizer, Result}; use crate::utils::macro_rules_attribute; - -#[derive(Clone, Deserialize, Debug, Serialize)] +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; +#[derive(Clone, Deserialize, AutoDebug, Serialize, AutoDisplay)] #[serde(tag = "type")] /// Allows concatenating multiple other Normalizer as a Sequence. /// All the normalizers run in sequence in the given order against the same NormalizedString. 
@@ -36,9 +36,10 @@ impl Normalizer for Sequence { } /// Lowercases the input -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, AutoDebug, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct Lowercase; + impl Normalizer for Lowercase { fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { normalized.lowercase(); diff --git a/tokenizers/src/pre_tokenizers/bert.rs b/tokenizers/src/pre_tokenizers/bert.rs index 93fdd05c1..0030f785d 100644 --- a/tokenizers/src/pre_tokenizers/bert.rs +++ b/tokenizers/src/pre_tokenizers/bert.rs @@ -1,12 +1,13 @@ use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; use crate::utils::macro_rules_attribute; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use unicode_categories::UnicodeCategories; fn is_bert_punc(x: char) -> bool { char::is_ascii_punctuation(&x) || x.is_punctuation() } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, AutoDebug, PartialEq, Eq, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct BertPreTokenizer; diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs index 2d3845b55..577c2ad58 100644 --- a/tokenizers/src/pre_tokenizers/byte_level.rs +++ b/tokenizers/src/pre_tokenizers/byte_level.rs @@ -1,13 +1,13 @@ use std::collections::{HashMap, HashSet}; -use crate::utils::SysRegex; -use serde::{Deserialize, Serialize}; - use crate::tokenizer::{ Decoder, Encoding, PostProcessor, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior, }; use crate::utils::macro_rules_attribute; +use crate::utils::SysRegex; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; +use serde::{Deserialize, Serialize}; /// Converts bytes to unicode characters. /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9 @@ -46,11 +46,11 @@ lazy_static! { bytes_char().into_iter().map(|(c, b)| (b, c)).collect(); } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care /// of all the required processing steps to transform a UTF-8 string as needed before and after the /// BPE model does its job. #[macro_rules_attribute(impl_serde_type!)] +#[derive(Copy, Clone, AutoDebug, PartialEq, Eq, AutoDisplay)] #[non_exhaustive] pub struct ByteLevel { /// Whether to add a leading space to the first word. 
This allows to treat the leading word diff --git a/tokenizers/src/pre_tokenizers/delimiter.rs b/tokenizers/src/pre_tokenizers/delimiter.rs index 64ef63ccc..2f81d4eeb 100644 --- a/tokenizers/src/pre_tokenizers/delimiter.rs +++ b/tokenizers/src/pre_tokenizers/delimiter.rs @@ -1,9 +1,10 @@ +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; use crate::utils::macro_rules_attribute; -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, AutoDebug, PartialEq, Eq, AutoDisplay)] #[non_exhaustive] #[macro_rules_attribute(impl_serde_type!)] pub struct CharDelimiterSplit { diff --git a/tokenizers/src/pre_tokenizers/digits.rs b/tokenizers/src/pre_tokenizers/digits.rs index 942e2521b..5fb76a6e4 100644 --- a/tokenizers/src/pre_tokenizers/digits.rs +++ b/tokenizers/src/pre_tokenizers/digits.rs @@ -1,9 +1,10 @@ +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; use crate::utils::macro_rules_attribute; -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, AutoDebug, PartialEq, Eq, AutoDisplay)] /// Pre tokenizes the numbers into single tokens. If individual_digits is set /// to true, then all digits are splitted into individual tokens. #[non_exhaustive] diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 52b415c9b..f0b1ba28c 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -1,8 +1,8 @@ use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{de, Deserialize, Deserializer, Serialize}; - /// Enum representing options for the metaspace prepending scheme. -#[derive(Debug, Clone, PartialEq, Serialize, Eq, Deserialize, Copy)] +#[derive(AutoDebug, Clone, PartialEq, Serialize, Eq, Deserialize, Copy, AutoDisplay)] #[serde(rename_all = "snake_case")] pub enum PrependScheme { /// Specifies that the scheme should be prepended only once, on the first split. 
@@ -13,7 +13,7 @@ pub enum PrependScheme { Always, } -#[derive(Debug, Clone, PartialEq, Serialize, Eq)] +#[derive(AutoDebug, Clone, PartialEq, Serialize, Eq, AutoDisplay)] /// Replaces all the whitespaces by the provided meta character and then /// splits on this character #[serde(tag = "type")] diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index cf64fb876..1b35a3971 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -22,9 +22,11 @@ use crate::pre_tokenizers::split::Split; use crate::pre_tokenizers::unicode_scripts::UnicodeScripts; use crate::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit}; use crate::{PreTokenizedString, PreTokenizer}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; -#[derive(Deserialize, Serialize, Clone, Debug, PartialEq)] +#[derive(Deserialize, Serialize, Clone, PartialEq, AutoDebug, AutoDisplay)] #[serde(untagged)] +#[format(fmt = "pre_tokenizers.{}")] pub enum PreTokenizerWrapper { BertPreTokenizer(BertPreTokenizer), ByteLevel(ByteLevel), diff --git a/tokenizers/src/pre_tokenizers/punctuation.rs b/tokenizers/src/pre_tokenizers/punctuation.rs index 0ba7d6025..dbfd1b29a 100644 --- a/tokenizers/src/pre_tokenizers/punctuation.rs +++ b/tokenizers/src/pre_tokenizers/punctuation.rs @@ -1,3 +1,4 @@ +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; @@ -8,7 +9,7 @@ fn is_punc(x: char) -> bool { char::is_ascii_punctuation(&x) || x.is_punctuation() } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, AutoDebug, PartialEq, Eq, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct Punctuation { #[serde(default = "default_split")] diff --git a/tokenizers/src/pre_tokenizers/sequence.rs b/tokenizers/src/pre_tokenizers/sequence.rs index 9dcafc673..98a6a06c1 100644 --- a/tokenizers/src/pre_tokenizers/sequence.rs +++ b/tokenizers/src/pre_tokenizers/sequence.rs @@ -1,12 +1,14 @@ use crate::pre_tokenizers::PreTokenizerWrapper; use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result}; use crate::utils::macro_rules_attribute; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; -#[derive(Clone, Debug, PartialEq)] #[macro_rules_attribute(impl_serde_type!)] +#[derive(Clone, PartialEq, AutoDisplay, AutoDebug)] pub struct Sequence { - pretokenizers: Vec, + #[format] + pub pretokenizers: Vec, } impl Sequence { diff --git a/tokenizers/src/pre_tokenizers/split.rs b/tokenizers/src/pre_tokenizers/split.rs index 0e2a9023b..55b9b3d2a 100644 --- a/tokenizers/src/pre_tokenizers/split.rs +++ b/tokenizers/src/pre_tokenizers/split.rs @@ -1,12 +1,12 @@ -use crate::utils::SysRegex; -use serde::{Deserialize, Deserializer, Serialize}; - use crate::tokenizer::{ pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior, }; +use crate::utils::SysRegex; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; +use serde::{Deserialize, Deserializer, Serialize}; /// Represents the different patterns that `Split` can use -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq, AutoDisplay)] pub enum SplitPattern { String(String), Regex(String), @@ -24,7 +24,7 @@ impl From<&str> for SplitPattern { } } -#[derive(Debug, Serialize)] +#[derive(AutoDebug, Serialize, AutoDisplay)] 
#[serde(tag = "type")] pub struct Split { pattern: SplitPattern, diff --git a/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs b/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs index 2b6b54eb6..df4c2e794 100644 --- a/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs +++ b/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs @@ -1,8 +1,10 @@ +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; + use crate::pre_tokenizers::unicode_scripts::scripts::{get_script, Script}; use crate::tokenizer::{normalizer::Range, PreTokenizedString, PreTokenizer, Result}; use crate::utils::macro_rules_attribute; -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, AutoDebug, PartialEq, Eq, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct UnicodeScripts; diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs index 8c24e8efb..cd38ee445 100644 --- a/tokenizers/src/pre_tokenizers/whitespace.rs +++ b/tokenizers/src/pre_tokenizers/whitespace.rs @@ -1,3 +1,4 @@ +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use regex::Regex; use crate::tokenizer::{ @@ -5,7 +6,7 @@ use crate::tokenizer::{ }; use crate::utils::macro_rules_attribute; -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, AutoDebug, PartialEq, Eq, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct Whitespace; @@ -28,7 +29,7 @@ impl PreTokenizer for Whitespace { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, AutoDebug, PartialEq, Eq, AutoDisplay)] #[macro_rules_attribute(impl_serde_type!)] pub struct WhitespaceSplit; diff --git a/tokenizers/src/processors/bert.rs b/tokenizers/src/processors/bert.rs index 627f9d180..9fd1c91e6 100644 --- a/tokenizers/src/processors/bert.rs +++ b/tokenizers/src/processors/bert.rs @@ -1,9 +1,10 @@ use crate::tokenizer::{Encoding, PostProcessor, Result}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::iter::FromIterator; -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Clone, AutoDebug, PartialEq, Eq, AutoDisplay)] #[serde(tag = "type")] pub struct BertProcessing { sep: (String, u32), diff --git a/tokenizers/src/processors/mod.rs b/tokenizers/src/processors/mod.rs index 130a537ba..266a23051 100644 --- a/tokenizers/src/processors/mod.rs +++ b/tokenizers/src/processors/mod.rs @@ -6,16 +6,17 @@ pub mod template; // Re-export these as processors pub use super::pre_tokenizers::byte_level; -use serde::{Deserialize, Serialize}; - use crate::pre_tokenizers::byte_level::ByteLevel; use crate::processors::bert::BertProcessing; use crate::processors::roberta::RobertaProcessing; use crate::processors::sequence::Sequence; use crate::processors::template::TemplateProcessing; use crate::{Encoding, PostProcessor, Result}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; +use serde::{Deserialize, Serialize}; -#[derive(Serialize, Deserialize, PartialEq, Debug, Clone, Eq)] +#[derive(Serialize, Deserialize, PartialEq, AutoDebug, Clone, Eq, AutoDisplay)] +#[format(fmt = "processors.{}")] #[serde(untagged)] pub enum PostProcessorWrapper { // Roberta must be before Bert for deserialization (serde does not validate tags) diff --git a/tokenizers/src/processors/roberta.rs b/tokenizers/src/processors/roberta.rs index 3af9a8d60..1dc52d1e2 100644 --- a/tokenizers/src/processors/roberta.rs +++ 
b/tokenizers/src/processors/roberta.rs @@ -1,10 +1,11 @@ use crate::processors::byte_level::process_offsets; use crate::tokenizer::{Encoding, PostProcessor, Result}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::iter::FromIterator; -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[derive(Serialize, Deserialize, AutoDebug, Clone, PartialEq, Eq, AutoDisplay)] #[serde(tag = "type")] pub struct RobertaProcessing { sep: (String, u32), diff --git a/tokenizers/src/processors/sequence.rs b/tokenizers/src/processors/sequence.rs index 66c670ad8..6be62720d 100644 --- a/tokenizers/src/processors/sequence.rs +++ b/tokenizers/src/processors/sequence.rs @@ -1,10 +1,10 @@ use crate::processors::PostProcessorWrapper; use crate::tokenizer::{Encoding, PostProcessor, Result}; use crate::utils::macro_rules_attribute; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; - -#[derive(Clone, Debug, PartialEq, Eq)] #[macro_rules_attribute(impl_serde_type!)] +#[derive(Clone, AutoDebug, PartialEq, Eq, AutoDisplay)] pub struct Sequence { processors: Vec, } diff --git a/tokenizers/src/processors/template.rs b/tokenizers/src/processors/template.rs index 9259180d0..6a6fe49e1 100644 --- a/tokenizers/src/processors/template.rs +++ b/tokenizers/src/processors/template.rs @@ -58,13 +58,13 @@ //! use crate::{Encoding, PostProcessor, Result}; use itertools::Itertools; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::convert::{TryFrom, TryInto}; use std::result::Result as StdResult; - /// Represents any sequences received as input of the PostProcessor -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq, AutoDisplay)] pub enum Sequence { /// This is the first sequence, the one that is always specified A, @@ -92,7 +92,7 @@ pub enum Sequence { /// /// [`SpecialToken`]: struct.SpecialToken.html /// -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq, AutoDisplay)] pub enum Piece { Sequence { id: Sequence, type_id: u32 }, SpecialToken { id: String, type_id: u32 }, @@ -250,7 +250,7 @@ impl SpecialToken { /// /// [`Piece`]: enum.Piece.html /// -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)] +#[derive(AutoDebug, Clone, PartialEq, Serialize, Deserialize, Eq, AutoDisplay)] #[serde(transparent)] pub struct Template(Vec); @@ -333,18 +333,22 @@ impl From> for Tokens { /// .unwrap(); /// ``` /// -#[derive(Debug, Clone, PartialEq, Builder, Serialize, Deserialize, Eq)] +#[derive(AutoDebug, Clone, PartialEq, Builder, Serialize, Deserialize, Eq, AutoDisplay)] #[serde(tag = "type", from = "TemplateProcessingDeserializer")] #[builder(build_fn(validate = "Self::validate"))] +#[format(fmt = "TemplateProcessing: {}")] pub struct TemplateProcessing { #[builder(try_setter, default = "\"$0\".try_into().unwrap()")] + #[format] single: Template, #[builder(try_setter, default = "\"$A:0 $B:1\".try_into().unwrap()")] + #[format] pair: Template, #[builder(setter(skip), default = "self.default_added(true)")] #[serde(skip)] added_single: usize, #[builder(setter(skip), default = "self.default_added(false)")] + #[format] #[serde(skip)] added_pair: usize, #[builder(setter(into), default)] diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs 
b/tokenizers/src/tokenizer/added_vocabulary.rs index a0c2f4542..03c569aff 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -2,6 +2,7 @@ use super::{ normalizer::Range, Model, NormalizedString, Normalizer, Offsets, PreTokenizedString, Token, }; use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use regex::Regex; use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer}; use std::collections::{HashMap, HashSet}; @@ -11,7 +12,7 @@ use std::collections::{HashMap, HashSet}; /// like: /// - Whether they should only match single words /// - Whether to include any whitespace on its left or right -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, AutoDisplay, AutoDebug)] pub struct AddedToken { /// The content of the added token pub content: String, @@ -138,13 +139,14 @@ fn space_rightmost_at_start(sentence: &str) -> usize { /// were to add new tokens after this training process, we couldn't make sure the merges pairs /// exist as required. /// -#[derive(Clone, Debug)] +#[derive(Clone, AutoDisplay, AutoDebug)] pub struct AddedVocabulary { /// Contains the mapping from String (token content) to ID. This map contains both special /// tokens and classic added tokens that were added to the this vocabulary. added_tokens_map: HashMap, /// Contains the mapping from ID to AddedToken for all the added tokens, both special /// and classic. + #[format(fmt = "added_token_decoder={}")] added_tokens_map_r: HashMap, /// Contains only the classic AddedToken, in the specific order the user gave them. @@ -154,6 +156,7 @@ pub struct AddedVocabulary { /// A Set, containing all the special token for easy access while decoding. This let's /// us remove them easily with an O(1) complexity. + #[format] special_tokens_set: HashSet, /// A RegexSet containing all the non-normalized patterns used to split on AddedTokens @@ -162,6 +165,7 @@ pub struct AddedVocabulary { split_normalized_trie: MatchingSet, /// Whether or not special tokens should be splitted when encoding. This is equivalent to ignoring them + #[format] encode_special_tokens: bool, } diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index 766ee1cd9..b91ff5b9d 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -17,14 +17,14 @@ use std::{ ops::{Deref, DerefMut}, path::{Path, PathBuf}, }; - -use serde::de::DeserializeOwned; -use serde::{Deserialize, Serialize}; - +extern crate rayon; use crate::utils::iter::ResultShunt; use crate::utils::parallelism::*; use crate::utils::progress::{ProgressBar, ProgressStyle}; - +use pyo3_special_method_derive_0_21::{AutoDisplay, PyDebug, PyDisplay}; +use rayon::current_thread_index; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; mod added_vocabulary; mod encoding; pub mod normalizer; @@ -405,7 +405,7 @@ where } } -#[derive(Serialize, Deserialize, Debug, Clone)] +#[derive(Serialize, Deserialize, Clone, AutoDisplay)] pub struct Tokenizer( TokenizerImpl< ModelWrapper, @@ -508,7 +508,7 @@ impl DerefMut for Tokenizer { pub struct TruncationParamError(String); /// A `Tokenizer` is capable of encoding/decoding any text. 
-#[derive(Clone, Debug)] +#[derive(Clone)] pub struct TokenizerImpl { // Tokenizer parts normalizer: Option, @@ -525,6 +525,77 @@ pub struct TokenizerImpl { padding: Option, } +impl PyDebug for TokenizerImpl +where + M: PyDebug, + N: PyDebug, + PT: PyDebug, + PP: PyDebug, + D: PyDebug, +{ + fn fmt_debug(&self) -> std::string::String { + format!( + "Tokenizer(normalizer={}, pre_tokenizer={}, model={}, post_processor={}, decoder={}, added_tokens_decoder={}, truncation={}, padding={})", + self.normalizer.fmt_debug(), + self.pre_tokenizer.fmt_debug(), + self.model.fmt_debug(), + self.post_processor.fmt_debug(), + self.decoder.fmt_debug(), + self.added_vocabulary.fmt_debug(), + self.truncation.fmt_debug(), + self.padding.fmt_debug() + ) + } +} + +impl PyDisplay for TokenizerImpl +where + M: PyDisplay, + N: PyDisplay, + PT: PyDisplay, + PP: PyDisplay, + D: PyDisplay, +{ + fn fmt_display(&self) -> std::string::String { + let normalizer_str = match &self.normalizer { + Some(n) => n.fmt_display().to_string(), + None => "None".to_string(), + }; + let pre_tokenizer_str = match &self.pre_tokenizer { + Some(pt) => pt.fmt_display().to_string(), + None => "None".to_string(), + }; + let post_processor_str = match &self.post_processor { + Some(pp) => pp.fmt_display().to_string(), + None => "None".to_string(), + }; + let decoder_str = match &self.decoder { + Some(d) => d.fmt_display().to_string(), + None => "None".to_string(), + }; + let truncation_str = match &self.truncation { + Some(t) => t.fmt_display().to_string(), + None => "None".to_string(), + }; + let padding_str = match &self.padding { + Some(p) => p.fmt_display().to_string(), + None => "None".to_string(), + }; + + format!( + "Tokenizer(normalizer={}, pre_tokenizer={}, model={}, post_processor={}, decoder={}, added_tokens_decoder={}, truncation={}, padding={})", + normalizer_str, + pre_tokenizer_str, + self.model.fmt_display(), + post_processor_str, + decoder_str, + self.added_vocabulary.fmt_display(), + truncation_str, + padding_str + ) + } +} + impl TokenizerImpl where M: Model, @@ -791,7 +862,7 @@ where EncodeInput::Single(s1) => (s1, None), EncodeInput::Dual(s1, s2) => (s1, Some(s2)), }; - + println!("thread id: {:?}", current_thread_index()); // Encode each sequence let encoding = self.encode_single_sequence(sequence, 0, OffsetType::Byte)?; let pair_encoding = pair @@ -880,6 +951,7 @@ where word_idx: Option, offsets_type: OffsetType, ) -> Result { + println!("do tokenizer {:?}", current_thread_index()); let mut pretokenized: PreTokenizedString = pretokenized.into(); pretokenized.tokenize(|normalized| self.model.tokenize(normalized.get()))?; pretokenized.into_encoding(word_idx, type_id, offsets_type) @@ -1297,6 +1369,7 @@ where #[cfg(test)] mod test { + #[cfg(feature = "http")] #[test] fn test_decoding_with_added_bpe() { diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs index e2f501abe..73fed1506 100644 --- a/tokenizers/src/tokenizer/normalizer.rs +++ b/tokenizers/src/tokenizer/normalizer.rs @@ -1,10 +1,10 @@ use crate::pattern::Pattern; use crate::{Offsets, Result}; +use pyo3_special_method_derive_0_21::AutoDisplay; +use serde::{Deserialize, Serialize}; use std::ops::{Bound, RangeBounds}; use unicode_normalization_alignments::UnicodeNormalization; -use serde::{Deserialize, Serialize}; - /// The possible offsets referential #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum OffsetReferential { @@ -78,7 +78,7 @@ where /// - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]` /// - 
MergedWithNext => `[ "the", "-final", "-", "-countdown" ]` /// - Contiguous => `[ "the", "-", "final", "--", "countdown" ]` -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq, AutoDisplay)] pub enum SplitDelimiterBehavior { Removed, Isolated, diff --git a/tokenizers/src/utils/padding.rs b/tokenizers/src/utils/padding.rs index 39585a304..318951398 100644 --- a/tokenizers/src/utils/padding.rs +++ b/tokenizers/src/utils/padding.rs @@ -1,9 +1,10 @@ use crate::parallelism::*; use crate::tokenizer::{Encoding, Result}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; /// The various possible padding directions. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +#[derive(AutoDebug, Clone, Copy, Serialize, Deserialize, AutoDisplay)] pub enum PaddingDirection { Left, Right, @@ -18,7 +19,7 @@ impl std::convert::AsRef for PaddingDirection { } } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(AutoDebug, Clone, Serialize, Deserialize, AutoDisplay)] pub struct PaddingParams { pub strategy: PaddingStrategy, pub direction: PaddingDirection, @@ -41,7 +42,7 @@ impl Default for PaddingParams { } } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(AutoDebug, Clone, Serialize, Deserialize, AutoDisplay)] pub enum PaddingStrategy { BatchLongest, Fixed(usize), diff --git a/tokenizers/src/utils/truncation.rs b/tokenizers/src/utils/truncation.rs index a8ad2a614..b11766bba 100644 --- a/tokenizers/src/utils/truncation.rs +++ b/tokenizers/src/utils/truncation.rs @@ -1,9 +1,10 @@ use crate::tokenizer::{Encoding, Result}; +use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay}; use serde::{Deserialize, Serialize}; use std::cmp; use std::mem; -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq, Default)] +#[derive(AutoDebug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq, Default, AutoDisplay)] pub enum TruncationDirection { Left, #[default] @@ -19,10 +20,11 @@ impl std::convert::AsRef for TruncationDirection { } } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(AutoDebug, Clone, Serialize, Deserialize, AutoDisplay)] pub struct TruncationParams { #[serde(default)] pub direction: TruncationDirection, + #[format(skip)] pub max_length: usize, pub strategy: TruncationStrategy, pub stride: usize, @@ -49,7 +51,7 @@ pub enum TruncationError { SequenceTooShort, } -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)] +#[derive(AutoDebug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq, AutoDisplay)] pub enum TruncationStrategy { LongestFirst, OnlyFirst,