Add display capabilities to tokenizers objects #1542

Closed
wants to merge 104 commits into main from add-display
Changes from all commits
104 commits
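This PR wires Python display support (`__str__`, `__repr__`, and in places `__dir__`/`__dict__`) into the bindings via the `pyo3_special_method_derive` crate, so tokenizer objects print a readable description of their components instead of a bare object address. A minimal sketch of the intended usage follows; the checkpoint name and the exact rendered strings are illustrative assumptions, not output taken from this PR:

```python
from tokenizers import Tokenizer

# Any tokenizer works here; "bert-base-uncased" is just an example identifier.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

# Before this change these calls fall back to the default object repr,
# e.g. "<tokenizers.Tokenizer object at 0x7f...>". With the Str/Repr derives
# added below they are expected to describe the nested components instead.
print(tokenizer)                   # __str__ from the `Str` derive
print(repr(tokenizer.normalizer))  # __repr__ from the `Repr` derive
print(tokenizer.pre_tokenizer)     # nested components shown by name
```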
61804d9
initial commit
ArthurZucker Jun 3, 2024
a56da5f
will this work?
ArthurZucker Jun 3, 2024
f1a6a97
make it work for the model for now
ArthurZucker Jun 3, 2024
4a49530
updates
ArthurZucker Jun 3, 2024
f4af616
update
ArthurZucker Jun 3, 2024
88630dc
add metaspace
ArthurZucker Jun 3, 2024
b9d44da
update
ArthurZucker Jun 3, 2024
a90ec22
does not work
ArthurZucker Jun 3, 2024
2224275
current modifications
ArthurZucker Jun 4, 2024
4d9204e
current status
ArthurZucker Jun 4, 2024
4c2aca1
working shit
ArthurZucker Jun 4, 2024
904ce70
this kinda works
ArthurZucker Jun 4, 2024
6413810
finallllly!
ArthurZucker Jun 4, 2024
fda66f5
nits
ArthurZucker Jun 4, 2024
20c9fc4
updates
ArthurZucker Jun 4, 2024
86c77b6
almost there
ArthurZucker Jun 4, 2024
a429642
update
ArthurZucker Jun 4, 2024
3cec010
more nits
ArthurZucker Jun 4, 2024
8d77286
nit
ArthurZucker Jun 4, 2024
e48cd3a
Update bindings/python/src/pre_tokenizers.rs
ArthurZucker Jun 6, 2024
27576e5
ips
ArthurZucker Jun 4, 2024
0d9a452
Merge branch 'add-display' of github.com:huggingface/tokenizers into …
ArthurZucker Jun 6, 2024
35373de
Merge branch 'add-display' of github.com:huggingface/tokenizers into …
ArthurZucker Jun 6, 2024
1c6d272
update
ArthurZucker Jun 8, 2024
df51116
update and fix
ArthurZucker Jun 8, 2024
4b4b833
Merge branch 'add-display' of github.com:huggingface/tokenizers into …
ArthurZucker Jun 8, 2024
59a89c9
only commit one line
ArthurZucker Jun 8, 2024
ac9b849
update
ArthurZucker Jun 8, 2024
a3f7439
update the added vocab string
ArthurZucker Jun 8, 2024
cf5b6f3
nit
ArthurZucker Jun 10, 2024
5d33243
fix sequence's display
ArthurZucker Jun 10, 2024
b73c43d
update display for normalizer sequence
ArthurZucker Jun 10, 2024
3e16df7
Merge branch 'main' of github.com:huggingface/tokenizers into add-dis…
ArthurZucker Jun 10, 2024
b214d77
style
ArthurZucker Jun 10, 2024
0654831
small nit
ArthurZucker Jun 10, 2024
a15e3cc
updates to cleanup
ArthurZucker Jun 10, 2024
6023192
update
ArthurZucker Jun 10, 2024
ebf1258
update
ArthurZucker Jun 10, 2024
477a9b5
nits
ArthurZucker Jun 10, 2024
93a1e63
fix some stuff
ArthurZucker Jun 10, 2024
7591f2b
update sequence for pre_tokenizers using fold
ArthurZucker Jun 10, 2024
f50e4e0
update
ArthurZucker Jun 10, 2024
4f15052
proper padding derive
ArthurZucker Jun 10, 2024
85c7b69
update trunctation for consistency
ArthurZucker Jun 10, 2024
0a16ca0
clean
ArthurZucker Jun 10, 2024
35d442d
styling
ArthurZucker Jun 10, 2024
a3cc764
update added tokens decoder as getter
ArthurZucker Jun 10, 2024
5b20fa7
update init property
ArthurZucker Jun 10, 2024
15f877e
nit
ArthurZucker Jun 10, 2024
9c45e8f
update sequences and basic enums to show xxxx.Sequence
ArthurZucker Jun 10, 2024
4a34870
update
ArthurZucker Jun 10, 2024
e0d35e0
update
ArthurZucker Jun 10, 2024
fe95add
some finishing touch
ArthurZucker Jun 10, 2024
2770099
Update bindings/python/Cargo.toml
ArthurZucker Jun 10, 2024
3d0eb0a
nit
ArthurZucker Jun 10, 2024
11a3601
gracefully handle errors for the proc macro
ArthurZucker Jun 11, 2024
f6fa136
Merge branch 'add-display' of github.com:huggingface/tokenizers into …
ArthurZucker Jun 11, 2024
2a54482
remove derive_more
ArthurZucker Jun 11, 2024
998b2a3
update my custom macro
ArthurZucker Jun 11, 2024
4df6cc2
replace derive more
ArthurZucker Jun 11, 2024
a9c6c61
Merge branch 'main' into add-display
ArthurZucker Jun 11, 2024
aefdc91
stash
ArthurZucker Jun 11, 2024
f67af9c
updates
ArthurZucker Jun 12, 2024
4c3f37a
update display derive
ArthurZucker Jun 12, 2024
292475f
blindly fix stuff
ArthurZucker Jun 12, 2024
99cb054
maybe work
ArthurZucker Jun 12, 2024
5c930e9
remove tests from vendored parsing
ArthurZucker Jun 12, 2024
f87bb97
update
ArthurZucker Jun 12, 2024
c4b4f3c
simplify some stuff
ArthurZucker Jun 12, 2024
e712079
current status, not bad but not soooooo good
ArthurZucker Jun 13, 2024
5540136
is this a good start?
ArthurZucker Jun 14, 2024
ba03c16
small changes
ArthurZucker Jun 14, 2024
d0e741b
format does not work yet
ArthurZucker Jun 14, 2024
19afb66
some cleanup of unnecessary things
ArthurZucker Jun 16, 2024
9559dea
nit
ArthurZucker Jun 16, 2024
e53f4ca
current status
ArthurZucker Jun 16, 2024
18238dd
let's just go with this
ArthurZucker Jun 16, 2024
269ff21
update
ArthurZucker Jun 17, 2024
e799602
Merge branch 'main' into add-display
ArthurZucker Jul 15, 2024
93ad593
update
ArthurZucker Jul 15, 2024
3aa0138
derive auto display
ArthurZucker Jul 19, 2024
011340b
nit
ArthurZucker Jul 19, 2024
3fc31d0
nice
ArthurZucker Jul 19, 2024
51d3f61
updates
ArthurZucker Jul 19, 2024
acb8196
Merge branch 'main' into add-display
ArthurZucker Jul 19, 2024
951b6e6
deos
ArthurZucker Jul 19, 2024
c2a320c
Merge branch 'add-display' of github.com:huggingface/tokenizers into …
ArthurZucker Jul 19, 2024
2048c02
fix build
ArthurZucker Jul 19, 2024
104fe0c
Use pyo3 smd v0.21 (#1574)
EricLBuehler Jul 20, 2024
7db6109
stash commit, wanna make sure this is recorded
ArthurZucker Jul 21, 2024
c7cd927
what works a bit ?
ArthurZucker Jul 25, 2024
e4cf65a
update
ArthurZucker Jul 25, 2024
39ffc28
fix tokenizer's wrapping
ArthurZucker Jul 27, 2024
0a3bb18
fix normalizer display
ArthurZucker Jul 27, 2024
c436b23
fix!
ArthurZucker Jul 27, 2024
e5b059f
final touch?
ArthurZucker Jul 27, 2024
ff825a7
full autodebug
ArthurZucker Jul 28, 2024
c30df0c
remove dict and dir as it's gonna be a bit more involved
ArthurZucker Jul 28, 2024
b78e11c
remove pub where it is not necessary
ArthurZucker Jul 30, 2024
a99c645
fmt =
ArthurZucker Jul 30, 2024
9022470
formating
ArthurZucker Aug 2, 2024
64b8df0
remove non needed fm
ArthurZucker Aug 2, 2024
27cad45
so we only need format when the visibility is not pub but pub(crate)
ArthurZucker Aug 2, 2024
ceabef3
Merge branch 'main' into add-display
ArthurZucker Aug 2, 2024
4 changes: 3 additions & 1 deletion bindings/python/Cargo.toml
@@ -14,10 +14,12 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
serde_json = "1.0"
libc = "0.2"
env_logger = "0.11"
pyo3 = { version = "0.21" }
numpy = "0.21"
ndarray = "0.15"
itertools = "0.12"
derive_more = "0.99.17"
pyo3 = { version = "0.21", features = ["multiple-pymethods"] }
pyo3_special_method_derive_0_21 = {path = "../../../pyo3-special-method-derive/pyo3_special_method_derive_0_21"}

[dependencies.tokenizers]
path = "../../tokenizers"
Empty file added bindings/python/grep
Empty file.
13 changes: 9 additions & 4 deletions bindings/python/src/decoders.rs
@@ -5,6 +5,7 @@ use crate::utils::PyPattern;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay, Repr, Str};
use serde::de::Error;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use tk::decoders::bpe::BPEDecoder;
@@ -28,9 +29,11 @@ use super::error::ToPyResult;
/// This class is not supposed to be instantiated directly. Instead, any implementation of
/// a Decoder will return an instance of this class when instantiated.
#[pyclass(dict, module = "tokenizers.decoders", name = "Decoder", subclass)]
#[derive(Clone, Deserialize, Serialize)]
#[derive(Clone, Deserialize, Serialize, Str, Repr)]
#[format(fmt = "{}")]
pub struct PyDecoder {
#[serde(flatten)]
#[format]
pub(crate) decoder: PyDecoderWrapper,
Comment on lines +36 to 37 (Collaborator Author): visibility here forces us to add format

}

@@ -478,9 +481,10 @@ impl PySequenceDecoder {
}
}

#[derive(Clone)]
#[derive(Clone, AutoDisplay, AutoDebug)]
pub(crate) struct CustomDecoder {
inner: PyObject,
#[format(skip)]
pub inner: PyObject,
Comment on lines +486 to +487 (Collaborator Author): Not implemented yet so skipping for now

}

impl CustomDecoder {
@@ -531,8 +535,9 @@ impl<'de> Deserialize<'de> for CustomDecoder {
}
}

#[derive(Clone, Deserialize, Serialize)]
#[derive(Clone, Deserialize, Serialize, AutoDisplay, AutoDebug)]
#[serde(untagged)]
#[format(fmt = "{}")]
pub(crate) enum PyDecoderWrapper {
Custom(Arc<RwLock<CustomDecoder>>),
Wrapped(Arc<RwLock<DecoderWrapper>>),
Comment on lines +540 to 543 (Collaborator Author): this will directly display Arc<RwLock<CustomDecoder>>

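Per the comments above, the Python-implemented CustomDecoder is excluded from the derived output with `#[format(skip)]` for now, and the wrapper enum displays its `Arc<RwLock<...>>` contents directly. For built-in decoders the new `Str`/`Repr` derives should already produce something readable; a small sketch, where the printed wording is an assumption:

```python
from tokenizers import decoders

# Sequence composes several built-in decoders.
dec = decoders.Sequence([decoders.ByteFallback(), decoders.Fuse()])

# With the derives added in decoders.rs these are expected to list the wrapped
# decoder variants rather than "<tokenizers.decoders.Sequence object at 0x...>".
print(str(dec))
print(repr(dec))
```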
7 changes: 4 additions & 3 deletions bindings/python/src/models.rs
@@ -2,11 +2,13 @@ use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};

use super::error::{deprecation_warning, ToPyResult};
use crate::token::PyToken;
use crate::trainers::PyTrainer;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use pyo3_special_method_derive_0_21::{Repr, Str};
use serde::{Deserialize, Serialize};
use tk::models::bpe::{BpeBuilder, Merges, Vocab, BPE};
use tk::models::unigram::Unigram;
@@ -16,16 +18,15 @@ use tk::models::ModelWrapper;
use tk::{Model, Token};
use tokenizers as tk;

use super::error::{deprecation_warning, ToPyResult};

/// Base class for all models
///
/// The model represents the actual tokenization algorithm. This is the part that
/// will contain and manage the learned vocabulary.
///
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass(module = "tokenizers.models", name = "Model", subclass)]
#[derive(Clone, Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize, Str, Repr)]
#[format(fmt = "{}")]
pub struct PyModel {
#[serde(flatten)]
pub model: Arc<RwLock<ModelWrapper>>,
22 changes: 15 additions & 7 deletions bindings/python/src/normalizers.rs
@@ -1,11 +1,11 @@
use std::sync::{Arc, RwLock};

use crate::error::ToPyResult;
use crate::utils::{PyNormalizedString, PyNormalizedStringRefMut, PyPattern};
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;

use crate::error::ToPyResult;
use crate::utils::{PyNormalizedString, PyNormalizedStringRefMut, PyPattern};
use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay, Dict, Dir, Repr, Str};
use serde::ser::SerializeStruct;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use tk::normalizers::{
@@ -43,9 +43,11 @@ impl PyNormalizedStringMut<'_> {
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// Normalizer will return an instance of this class when instantiated.
#[pyclass(dict, module = "tokenizers.normalizers", name = "Normalizer", subclass)]
#[derive(Clone, Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize, Str, Repr, Dir)]
#[format(fmt = "{}")]
pub struct PyNormalizer {
#[serde(flatten)]
#[format]
pub(crate) normalizer: PyNormalizerTypeWrapper,
}

@@ -477,7 +479,10 @@ impl PyNmt {
/// Precompiled normalizer
/// Don't use manually it is used for compatiblity for SentencePiece.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
#[derive(Str)]
#[format(fmt = "PreCompiled")]
pub struct PyPrecompiled {}

#[pymethods]
impl PyPrecompiled {
#[new]
@@ -513,8 +518,9 @@ impl PyReplace {
}
}

#[derive(Debug, Clone)]
#[derive(AutoDebug, Clone, AutoDisplay)]
pub(crate) struct CustomNormalizer {
#[format(fmt = "Custom Normalizer")]
inner: PyObject,
}
impl CustomNormalizer {
@@ -556,8 +562,9 @@ impl<'de> Deserialize<'de> for CustomNormalizer {
}
}

#[derive(Debug, Clone, Deserialize)]
#[derive(AutoDebug, Clone, Deserialize, AutoDisplay)]
#[serde(untagged)]
#[format(fmt = "{}")]
pub(crate) enum PyNormalizerWrapper {
Custom(CustomNormalizer),
Wrapped(NormalizerWrapper),
@@ -575,8 +582,9 @@ impl Serialize for PyNormalizerWrapper {
}
}

#[derive(Debug, Clone, Deserialize)]
#[derive(Clone, Deserialize, AutoDisplay, AutoDebug)]
#[serde(untagged)]
#[format(fmt = "{}")]
pub(crate) enum PyNormalizerTypeWrapper {
Sequence(Vec<Arc<RwLock<PyNormalizerWrapper>>>),
Single(Arc<RwLock<PyNormalizerWrapper>>),
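Several commits above ("fix sequence's display", "update display for normalizer sequence") target how a composed normalizer renders its members through `PyNormalizerTypeWrapper::Sequence`. A short sketch of the behaviour being aimed for; the exact layout of the output is an assumption:

```python
from tokenizers import normalizers

norm = normalizers.Sequence([normalizers.NFC(), normalizers.Lowercase()])

# Expected to show the Sequence together with its member normalizers
# (NFC, Lowercase) instead of an opaque object repr.
print(norm)
print(repr(norm))
```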
18 changes: 12 additions & 6 deletions bindings/python/src/pre_tokenizers.rs
@@ -23,7 +23,7 @@ use tokenizers as tk;

use super::error::ToPyResult;
use super::utils::*;

use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay, Dict, Dir, Repr, Str};
/// Base class for all pre-tokenizers
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
@@ -34,10 +34,12 @@ use super::utils::*;
name = "PreTokenizer",
subclass
)]
#[derive(Clone, Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize, Str, Repr, Dir, Dict)]
#[format(fmt = "{}")] // don't format the Py wrapper
pub struct PyPreTokenizer {
#[serde(flatten)]
pub(crate) pretok: PyPreTokenizerTypeWrapper,
#[format]
pretok: PyPreTokenizerTypeWrapper,
}

impl PyPreTokenizer {
@@ -425,6 +427,8 @@ impl PyPunctuation {

/// This pre-tokenizer composes other pre_tokenizers and applies them in sequence
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Sequence")]
#[derive(AutoDisplay)]
#[format(fmt = "Sequence.{}")]
pub struct PySequence {}
#[pymethods]
impl PySequence {
@@ -587,7 +591,7 @@ impl PyUnicodeScripts {
}
}

#[derive(Clone)]
#[derive(Clone, AutoDisplay, AutoDebug)]
pub(crate) struct CustomPreTokenizer {
inner: PyObject,
}
@@ -631,8 +635,9 @@ impl<'de> Deserialize<'de> for CustomPreTokenizer {
}
}

#[derive(Clone, Deserialize)]
#[derive(Clone, Deserialize, AutoDisplay, AutoDebug)]
#[serde(untagged)]
#[format(fmt = "{}")]
pub(crate) enum PyPreTokenizerWrapper {
Custom(CustomPreTokenizer),
Wrapped(PreTokenizerWrapper),
@@ -650,8 +655,9 @@ impl Serialize for PyPreTokenizerWrapper {
}
}

#[derive(Clone, Deserialize)]
#[derive(Clone, Deserialize, AutoDisplay, AutoDebug)]
#[serde(untagged)]
#[format(fmt = "{}")]
pub(crate) enum PyPreTokenizerTypeWrapper {
Sequence(Vec<Arc<RwLock<PyPreTokenizerWrapper>>>),
Single(Arc<RwLock<PyPreTokenizerWrapper>>),
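The `Sequence` pre-tokenizer above composes other pre-tokenizers and applies them in order, and its `#[format(fmt = "Sequence.{}")]` attribute suggests it will render with a `Sequence.` prefix. A sketch of such a composition; the printed form is an assumption, while the `pre_tokenize_str` result is ordinary bindings behaviour:

```python
from tokenizers import pre_tokenizers

pt = pre_tokenizers.Sequence(
    [pre_tokenizers.Whitespace(), pre_tokenizers.Digits(individual_digits=True)]
)

print(pt)  # expected to start with "Sequence." per the #[format] attribute above

# The composition applies each pre-tokenizer in order: Whitespace splits the
# sentence into words, then Digits splits "911" into single digits.
print(pt.pre_tokenize_str("Call 911"))
# [('Call', (0, 4)), ('9', (5, 6)), ('1', (6, 7)), ('1', (7, 8))]
```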
9 changes: 5 additions & 4 deletions bindings/python/src/processors.rs
@@ -1,12 +1,12 @@
use std::convert::TryInto;
use std::sync::Arc;

use crate::encoding::PyEncoding;
use crate::error::ToPyResult;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;

use crate::encoding::PyEncoding;
use crate::error::ToPyResult;
use pyo3_special_method_derive_0_21::{Repr, Str};
use serde::{Deserialize, Serialize};
use tk::processors::bert::BertProcessing;
use tk::processors::byte_level::ByteLevel;
@@ -27,7 +27,8 @@ use tokenizers as tk;
name = "PostProcessor",
subclass
)]
#[derive(Clone, Deserialize, Serialize)]
#[derive(Clone, Deserialize, Serialize, Str, Repr)]
#[format(fmt = "{}")]
pub struct PyPostProcessor {
#[serde(flatten)]
pub processor: Arc<PostProcessorWrapper>,
27 changes: 14 additions & 13 deletions bindings/python/src/tokenizer.rs
@@ -1,12 +1,23 @@
use std::collections::{hash_map::DefaultHasher, HashMap};
use std::hash::{Hash, Hasher};

use super::decoders::PyDecoder;
use super::encoding::PyEncoding;
use super::error::{PyError, ToPyResult};
use super::models::PyModel;
use super::normalizers::PyNormalizer;
use super::pre_tokenizers::PyPreTokenizer;
use super::trainers::PyTrainer;
use crate::processors::PyPostProcessor;
use crate::utils::{MaybeSizedIterator, PyBufferedIterator};
use numpy::{npyffi, PyArray1};
use pyo3::class::basic::CompareOp;
use pyo3::exceptions;
use pyo3::intern;
use pyo3::prelude::*;
use pyo3::types::*;
use pyo3_special_method_derive_0_21::{Repr, Str};
use std::collections::BTreeMap;
use tk::models::bpe::BPE;
use tk::tokenizer::{
Model, PaddingDirection, PaddingParams, PaddingStrategy, PostProcessor, TokenizerImpl,
@@ -15,17 +26,6 @@ use tk::tokenizer::{
use tk::utils::iter::ResultShunt;
use tokenizers as tk;

use super::decoders::PyDecoder;
use super::encoding::PyEncoding;
use super::error::{PyError, ToPyResult};
use super::models::PyModel;
use super::normalizers::PyNormalizer;
use super::pre_tokenizers::PyPreTokenizer;
use super::trainers::PyTrainer;
use crate::processors::PyPostProcessor;
use crate::utils::{MaybeSizedIterator, PyBufferedIterator};
use std::collections::BTreeMap;

/// Represents a token that can be be added to a :class:`~tokenizers.Tokenizer`.
/// It can have special options that defines the way it should behave.
///
@@ -462,9 +462,10 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
/// The core algorithm that this :obj:`Tokenizer` should be using.
///
#[pyclass(dict, module = "tokenizers", name = "Tokenizer")]
#[derive(Clone)]
#[derive(Clone, Str, Repr)]
#[format(fmt = "{}")]
pub struct PyTokenizer {
tokenizer: Tokenizer,
pub tokenizer: Tokenizer,
Comment (Collaborator Author): Not a requirement

}

impl PyTokenizer {
1 change: 0 additions & 1 deletion bindings/python/src/utils/normalization.rs
@@ -6,7 +6,6 @@ use pyo3::prelude::*;
use pyo3::types::*;
use tk::normalizer::{char_to_bytes, NormalizedString, Range, SplitDelimiterBehavior};
use tk::pattern::Pattern;

/// Represents a Pattern as used by `NormalizedString`
#[derive(Clone, FromPyObject)]
pub enum PyPattern {
1 change: 1 addition & 0 deletions tokenizers/Cargo.toml
@@ -63,6 +63,7 @@ fancy-regex = { version = "0.13", optional = true}
getrandom = { version = "0.2.10" }
esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
monostate = "0.1.12"
pyo3_special_method_derive_0_21 = {path = "../../pyo3-special-method-derive/pyo3_special_method_derive_0_21"}
Comment (Collaborator Author): Do not forget to remove


[features]
default = ["progressbar", "onig", "esaxx_fast"]
5 changes: 2 additions & 3 deletions tokenizers/src/decoders/bpe.rs
@@ -1,8 +1,7 @@
use crate::tokenizer::{Decoder, Result};

use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay};
use serde::{Deserialize, Serialize};

#[derive(Deserialize, Clone, Debug, Serialize)]
#[derive(Deserialize, Clone, AutoDebug, Serialize, AutoDisplay)]
/// Allows decoding Original BPE by joining all the tokens and then replacing
/// the suffix used to identify end-of-words by whitespaces
#[serde(tag = "type")]
6 changes: 3 additions & 3 deletions tokenizers/src/decoders/byte_fallback.rs
@@ -1,13 +1,13 @@
use crate::tokenizer::{Decoder, Result};
use monostate::MustBe;

use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay};
use serde::{Deserialize, Serialize};

#[derive(Deserialize, Clone, Debug, Serialize, Default)]
#[derive(Deserialize, Clone, AutoDebug, Serialize, Default, AutoDisplay)]
/// ByteFallback is a simple trick which converts tokens looking like `<0x61>`
/// to pure bytes, and attempts to make them into a string. If the tokens
/// cannot be decoded you will get � instead for each inconvertable byte token
#[non_exhaustive]
#[format(fmt = "ByteFallback")]
pub struct ByteFallback {
#[serde(rename = "type")]
type_: MustBe!("ByteFallback"),
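The doc comment above describes `ByteFallback` converting `<0xNN>` tokens back into raw bytes. A minimal sketch of that behaviour through the Python bindings; treat the decoded string as the expected value implied by the doc comment, not a test taken from this PR:

```python
from tokenizers import decoders

dec = decoders.ByteFallback()

# <0x61> and <0x62> are the byte tokens for "a" and "b"; other tokens pass through.
print(dec.decode(["<0x61>", "<0x62>", "c"]))  # expected: "abc"
```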
4 changes: 2 additions & 2 deletions tokenizers/src/decoders/ctc.rs
@@ -1,10 +1,10 @@
use crate::decoders::wordpiece;
use crate::tokenizer::{Decoder, Result};

use itertools::Itertools;
use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay};
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(AutoDebug, Clone, Serialize, Deserialize, AutoDisplay)]
/// The CTC (Connectionist Temporal Classification) decoder takes care
/// of sanitizing a list of inputs token.
/// Due to some alignement problem the output of some models can come
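The CTC decoder's doc comment above is about sanitizing raw model outputs. A small sketch, assuming the usual CTC cleanup of collapsing consecutive repeats and dropping the pad token (default `pad_token="<pad>"`):

```python
from tokenizers import decoders

dec = decoders.CTC()  # defaults: pad_token="<pad>", word_delimiter_token="|", cleanup=True

# Consecutive duplicates are collapsed and pad tokens are removed.
tokens = ["<pad>", "h", "h", "e", "e", "l", "l", "<pad>", "l", "o", "o"]
print(dec.decode(tokens))  # expected: "hello"
```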
5 changes: 3 additions & 2 deletions tokenizers/src/decoders/fuse.rs
@@ -1,13 +1,14 @@
use crate::tokenizer::{Decoder, Result};
use monostate::MustBe;
use pyo3_special_method_derive_0_21::{AutoDebug, AutoDisplay};
use serde::{Deserialize, Serialize};

#[derive(Clone, Debug, Serialize, Deserialize, Default)]
#[derive(Clone, AutoDebug, Serialize, Deserialize, Default, AutoDisplay)]
/// Fuse simply fuses all tokens into one big string.
/// It's usually the last decoding step anyway, but this
/// decoder exists incase some decoders need to happen after that
/// step
#[non_exhaustive]
#[format(fmt = "Fuse")]
pub struct Fuse {
#[serde(rename = "type")]
type_: MustBe!("Fuse"),
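`Fuse`, as its doc comment says, simply concatenates all remaining tokens into one string, typically as the last decoding step. A one-line sketch of that behaviour:

```python
from tokenizers import decoders

print(decoders.Fuse().decode(["Hel", "lo", " there"]))  # expected: "Hello there"
```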