diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 4a4af94dd..1a03a7721 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -114,6 +114,16 @@ impl PyDecoder {
     fn decode(&self, tokens: Vec<String>) -> PyResult<String> {
         ToPyResult(self.decoder.decode(tokens)).into()
     }
+
+    fn __repr__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::repr(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
+
+    fn __str__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::to_string(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
 }
 
 macro_rules! getter {
diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs
index 2bfaafd34..424be9f57 100644
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@@ -220,6 +220,16 @@ impl PyModel {
     fn get_trainer(&self, py: Python<'_>) -> PyResult<PyObject> {
         PyTrainer::from(self.model.read().unwrap().get_trainer()).get_as_subtype(py)
     }
+
+    fn __repr__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::repr(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
+
+    fn __str__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::to_string(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
 }
 
 /// An implementation of the BPE (Byte-Pair Encoding) algorithm
diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs
index 724e79b85..51c1e8bfe 100644
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -169,6 +169,16 @@ impl PyNormalizer {
         ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?;
         Ok(normalized.get().to_owned())
     }
+
+    fn __repr__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::repr(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
+
+    fn __str__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::to_string(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
 }
 
 macro_rules! getter {
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index a9060ec3b..4b97319d3 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -181,6 +181,16 @@ impl PyPreTokenizer {
             .map(|(s, o, _)| (s.to_owned(), o))
             .collect())
     }
+
+    fn __repr__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::repr(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
+
+    fn __str__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::to_string(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
 }
 
 macro_rules! getter {
diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index aceb1d446..1d8e8dfac 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -139,6 +139,16 @@ impl PyPostProcessor {
             .into_py()?;
         Ok(final_encoding.into())
     }
+
+    fn __repr__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::repr(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
+
+    fn __str__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::to_string(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
 }
 
 /// This post-processor takes care of adding the special tokens needed by
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 6faeb7ad7..f41bf335f 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -640,8 +640,12 @@ impl PyTokenizer {
         ToPyResult(self.tokenizer.save(path, pretty)).into()
     }
 
-    #[pyo3(signature = ())]
-    fn repr(&self) -> PyResult<String> {
+    fn __repr__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::repr(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
+
+    fn __str__(&self) -> PyResult<String> {
         crate::utils::serde_pyo3::to_string(self)
             .map_err(|e| exceptions::PyException::new_err(e.to_string()))
     }
diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs
index cbce2aef9..c71442298 100644
--- a/bindings/python/src/trainers.rs
+++ b/bindings/python/src/trainers.rs
@@ -69,6 +69,16 @@ impl PyTrainer {
             Err(e) => Err(e),
         }
     }
+
+    fn __repr__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::repr(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
+
+    fn __str__(&self) -> PyResult<String> {
+        crate::utils::serde_pyo3::to_string(self)
+            .map_err(|e| exceptions::PyException::new_err(e.to_string()))
+    }
 }
 
 impl Trainer for PyTrainer {
diff --git a/bindings/python/src/utils/serde_pyo3.rs b/bindings/python/src/utils/serde_pyo3.rs
index c5fc6453c..b7a189bd7 100644
--- a/bindings/python/src/utils/serde_pyo3.rs
+++ b/bindings/python/src/utils/serde_pyo3.rs
@@ -2,12 +2,17 @@ use serde::de::value::Error;
 use serde::{ser, Serialize};
 type Result<T> = ::std::result::Result<T, Error>;
 
-const MAX_DEPTH: usize = 5;
-
 pub struct Serializer {
     // This string starts empty and JSON is appended as values are serialized.
     output: String,
+    /// Each level remembers its own number of elements
+    num_elements: Vec<usize>,
+    max_elements: usize,
     level: usize,
+    max_depth: usize,
+    /// Maximum length of a string representation;
+    /// useful to elide huge payloads such as `precompiled_charmap`
+    max_string: usize,
 }
 
 // By convention, the public API of a Serde serializer is one or more `to_abc`
@@ -19,9 +24,34 @@ pub fn to_string<T>(value: &T) -> Result<String>
 where
     T: Serialize,
 {
+    let max_depth = 20;
+    let max_elements = 6;
+    let max_string = 100;
+    let mut serializer = Serializer {
+        output: String::new(),
+        level: 0,
+        max_depth,
+        max_elements,
+        num_elements: vec![0; max_depth],
+        max_string,
+    };
+    value.serialize(&mut serializer)?;
+    Ok(serializer.output)
+}
+
+pub fn repr<T>(value: &T) -> Result<String>
+where
+    T: Serialize,
+{
+    let max_depth = 200;
+    let max_string = usize::MAX;
     let mut serializer = Serializer {
         output: String::new(),
         level: 0,
+        max_depth,
+        max_elements: 100,
+        num_elements: vec![0; max_depth],
+        max_string,
     };
     value.serialize(&mut serializer)?;
     Ok(serializer.output)
 }
@@ -55,11 +85,6 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     // of the primitive types of the data model and map it to JSON by appending
     // into the output string.
     fn serialize_bool(self, v: bool) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
         self.output += if v { "True" } else { "False" };
         Ok(())
     }
@@ -83,11 +108,6 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     // Not particularly efficient but this is example code anyway. A more
     // performant approach would be to use the `itoa` crate.
     fn serialize_i64(self, v: i64) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
         self.output += &v.to_string();
         Ok(())
     }
@@ -105,11 +125,6 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     }
 
     fn serialize_u64(self, v: u64) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
         self.output += &v.to_string();
         Ok(())
     }
@@ -119,11 +134,6 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     }
 
     fn serialize_f64(self, v: f64) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
         self.output += &v.to_string();
         Ok(())
     }
@@ -138,14 +148,14 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     // get the idea. For example it would emit invalid JSON if the input string
     // contains a '"' character.
     fn serialize_str(self, v: &str) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
+        self.output += "\"";
+        if v.len() > self.max_string {
+            self.output += &v[..self.max_string];
             self.output += "...";
-            return Ok(());
+        } else {
+            self.output += v;
         }
         self.output += "\"";
-        self.output += v;
-        self.output += "\"";
         Ok(())
     }
@@ -181,11 +191,6 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     // In Serde, unit means an anonymous value containing no data. Map this to
    // JSON as `null`.
     fn serialize_unit(self) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
         self.output += "None";
         Ok(())
     }
@@ -207,11 +212,6 @@ impl<'a> ser::Serializer for &'a mut Serializer {
         _variant_index: u32,
         variant: &'static str,
     ) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
         // self.serialize_str(variant)
         self.output += variant;
         Ok(())
     }
@@ -241,11 +241,6 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     where
         T: ?Sized + Serialize,
     {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
         // variant.serialize(&mut *self)?;
         self.output += variant;
         self.output += "(";
@@ -265,12 +260,9 @@
     // explicitly in the serialized form. Some serializers may only be able to
     // support sequences for which the length is known up front.
     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(self);
-        }
         self.output += "[";
+        self.level = std::cmp::min(self.max_depth - 1, self.level + 1);
+        self.num_elements[self.level] = 0;
         Ok(self)
     }
@@ -279,12 +271,9 @@
     // means that the corresponding `Deserialize` implementation will know the
     // length without needing to look at the serialized data.
     fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(self);
-        }
         self.output += "(";
+        self.level = std::cmp::min(self.max_depth - 1, self.level + 1);
+        self.num_elements[self.level] = 0;
         Ok(self)
     }
@@ -306,26 +295,19 @@
         variant: &'static str,
         _len: usize,
     ) -> Result<Self::SerializeTupleVariant> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(self);
-        }
         // variant.serialize(&mut *self)?;
         self.output += variant;
         self.output += "(";
+        self.level = std::cmp::min(self.max_depth - 1, self.level + 1);
+        self.num_elements[self.level] = 0;
         Ok(self)
     }
 
     // Maps are represented in JSON as `{ K: V, K: V, ... }`.
     fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(self);
-        }
-        println!("Serialize map");
         self.output += "{";
+        self.level = std::cmp::min(self.max_depth - 1, self.level + 1);
+        self.num_elements[self.level] = 0;
         Ok(self)
     }
@@ -335,11 +317,6 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     // Deserialize implementation is required to know what the keys are without
     // looking at the serialized data.
     fn serialize_struct(self, name: &'static str, _len: usize) -> Result<Self::SerializeStruct> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(self);
-        }
         // self.serialize_map(Some(len))
         // name.serialize(&mut *self)?;
         if let Some(stripped) = name.strip_suffix("Helper") {
@@ -348,6 +325,8 @@
             self.output += name
         }
         self.output += "(";
+        self.level = std::cmp::min(self.max_depth - 1, self.level + 1);
+        self.num_elements[self.level] = 0;
         Ok(self)
     }
@@ -360,14 +339,11 @@
         variant: &'static str,
         _len: usize,
     ) -> Result<Self::SerializeStructVariant> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(self);
-        }
         // variant.serialize(&mut *self)?;
         self.output += variant;
         self.output += "(";
+        self.level = std::cmp::min(self.max_depth - 1, self.level + 1);
+        self.num_elements[self.level] = 0;
         Ok(self)
     }
 }
@@ -390,24 +366,25 @@ impl<'a> ser::SerializeSeq for &'a mut Serializer {
     where
         T: ?Sized + Serialize,
     {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
-        if !self.output.ends_with('[') {
-            self.output += ", ";
+        self.num_elements[self.level] += 1;
+        let num_elements = self.num_elements[self.level];
+        if num_elements < self.max_elements {
+            if !self.output.ends_with('[') {
+                self.output += ", ";
+            }
+            value.serialize(&mut **self)
+        } else {
+            if num_elements == self.max_elements {
+                self.output += ", ...";
+            }
+            Ok(())
         }
-        value.serialize(&mut **self)
     }
 
     // Close the sequence.
     fn end(self) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
+        self.num_elements[self.level] = 0;
+        self.level = self.level.saturating_sub(1);
         self.output += "]";
         Ok(())
     }
@@ -422,23 +399,24 @@ impl<'a> ser::SerializeTuple for &'a mut Serializer {
     where
         T: ?Sized + Serialize,
     {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
-        if !self.output.ends_with('(') {
-            self.output += ", ";
+        self.num_elements[self.level] += 1;
+        let num_elements = self.num_elements[self.level];
+        if num_elements < self.max_elements {
+            if !self.output.ends_with('(') {
+                self.output += ", ";
+            }
+            value.serialize(&mut **self)
+        } else {
+            if num_elements == self.max_elements {
+                self.output += ", ...";
+            }
+            Ok(())
         }
-        value.serialize(&mut **self)
     }
 
     fn end(self) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
+        self.num_elements[self.level] = 0;
+        self.level = self.level.saturating_sub(1);
         self.output += ")";
         Ok(())
     }
@@ -453,23 +431,24 @@ impl<'a> ser::SerializeTupleStruct for &'a mut Serializer {
     where
         T: ?Sized + Serialize,
     {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
-        if !self.output.ends_with('(') {
-            self.output += ", ";
+        self.num_elements[self.level] += 1;
+        let num_elements = self.num_elements[self.level];
+        if num_elements < self.max_elements {
+            if !self.output.ends_with('(') {
+                self.output += ", ";
+            }
+            value.serialize(&mut **self)
+        } else {
+            if num_elements == self.max_elements {
+                self.output += ", ...";
+            }
+            Ok(())
         }
-        value.serialize(&mut **self)
     }
 
     fn end(self) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
+        self.num_elements[self.level] = 0;
+        self.level = self.level.saturating_sub(1);
         self.output += ")";
         Ok(())
     }
@@ -492,23 +471,24 @@ impl<'a> ser::SerializeTupleVariant for &'a mut Serializer {
     where
         T: ?Sized + Serialize,
     {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
-        if !self.output.ends_with('(') {
-            self.output += ", ";
+        self.num_elements[self.level] += 1;
+        let num_elements = self.num_elements[self.level];
+        if num_elements < self.max_elements {
+            if !self.output.ends_with('(') {
+                self.output += ", ";
+            }
+            value.serialize(&mut **self)
+        } else {
+            if num_elements == self.max_elements {
+                self.output += ", ...";
+            }
+            Ok(())
        }
-        value.serialize(&mut **self)
     }
 
     fn end(self) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
+        self.num_elements[self.level] = 0;
+        self.level = self.level.saturating_sub(1);
        self.output += ")";
         Ok(())
     }
@@ -538,15 +518,19 @@ impl<'a> ser::SerializeMap for &'a mut Serializer {
     where
         T: ?Sized + Serialize,
     {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
-        if !self.output.ends_with('{') {
-            self.output += ", ";
+        self.num_elements[self.level] += 1;
+        let num_elements = self.num_elements[self.level];
+        if num_elements < self.max_elements {
+            if !self.output.ends_with('{') {
+                self.output += ", ";
+            }
+            key.serialize(&mut **self)
+        } else {
+            if num_elements == self.max_elements {
+                self.output += ", ...";
+            }
+            Ok(())
         }
-        key.serialize(&mut **self)
     }
 
     // It doesn't make a difference whether the colon is printed at the end of
@@ -556,21 +540,18 @@ impl<'a> ser::SerializeMap for &'a mut Serializer {
     where
         T: ?Sized + Serialize,
     {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
+        let num_elements = self.num_elements[self.level];
+        if num_elements < self.max_elements {
+            self.output += ":";
+            value.serialize(&mut **self)
+        } else {
+            Ok(())
         }
-        self.output += ":";
-        value.serialize(&mut **self)
     }
 
     fn end(self) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
+        self.num_elements[self.level] = 0;
+        self.level = self.level.saturating_sub(1);
         self.output += "}";
         Ok(())
     }
 }
@@ -586,11 +567,6 @@ impl<'a> ser::SerializeStruct for &'a mut Serializer {
     where
         T: ?Sized + Serialize,
     {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
         if !self.output.ends_with('(') {
             self.output += ", ";
         }
@@ -605,11 +581,8 @@
     }
 
     fn end(self) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
+        self.num_elements[self.level] = 0;
+        self.level = self.level.saturating_sub(1);
         self.output += ")";
         Ok(())
     }
@@ -625,11 +598,6 @@
     where
         T: ?Sized + Serialize,
     {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
         if !self.output.ends_with('(') {
             self.output += ", ";
         }
@@ -640,11 +608,8 @@
     }
 
     fn end(self) -> Result<()> {
-        self.level += 1;
-        if self.level > MAX_DEPTH {
-            self.output += "...";
-            return Ok(());
-        }
+        self.num_elements[self.level] = 0;
+        self.level = self.level.saturating_sub(1);
         self.output += ")";
         Ok(())
     }
@@ -674,7 +639,7 @@ fn test_struct() {
     let expected = r#"Test(int=1, seq=["a", "b"])"#;
     assert_eq!(to_string(&test).unwrap(), expected);
 }
-/*
+
 #[test]
 fn test_enum() {
     #[derive(Serialize)]
@@ -806,4 +771,3 @@ fn test_flatten() {
     let expected = r#"A(a=True, b=1)"#;
     assert_eq!(to_string(&u).unwrap(), expected);
 }
-*/
\ No newline at end of file
diff --git a/bindings/python/tests/test_serialization.py b/bindings/python/tests/test_serialization.py
index 4434e6304..d39282d67 100644
--- a/bindings/python/tests/test_serialization.py
+++ b/bindings/python/tests/test_serialization.py
@@ -5,6 +5,7 @@ import tqdm
 from huggingface_hub import hf_hub_download
 
 from tokenizers import Tokenizer
+from tokenizers.models import BPE, Unigram
 
 from .utils import albert_base, data_dir
 
@@ -16,6 +17,46 @@ def test_full_serialization_albert(self, albert_base):
         # file exceeds the buffer capacity
         Tokenizer.from_file(albert_base)
 
+    def test_str_big(self, albert_base):
+        tokenizer = Tokenizer.from_file(albert_base)
+        assert str(tokenizer) == """Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"<pad>", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":1, "content":"<unk>", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":2, "content":"[CLS]", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":3, "content":"[SEP]", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":4, "content":"[MASK]", "single_word":False, "lstrip":False, "rstrip":False, ...}], normalizer=Sequence(normalizers=[Replace(pattern=String("``"), content="\""), Replace(pattern=String("''"), content="\""), NFKD(), StripAccents(), Lowercase(), ...]), pre_tokenizer=Sequence(pretokenizers=[WhitespaceSplit(), Metaspace(replacement="▁", prepend_scheme=always, split=True)]), post_processor=TemplateProcessing(single=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0)], pair=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0), Sequence(id=B, type_id=1), SpecialToken(id="[SEP]", type_id=1)], special_tokens={"[CLS]":SpecialToken(id="[CLS]", ids=[2], tokens=["[CLS]"]), "[SEP]":SpecialToken(id="[SEP]", ids=[3], tokens=["[SEP]"])}), decoder=Metaspace(replacement="▁", prepend_scheme=always, split=True), model=Unigram(unk_id=1, vocab=[("<pad>", 0), ("<unk>", 0), ("[CLS]", 0), ("[SEP]", 0), ("[MASK]", 0), ...], byte_fallback=False))"""
+
+    def test_repr_str(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.add_tokens(["my"])
+        assert repr(tokenizer) == """Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"my", "single_word":False, "lstrip":False, "rstrip":False, "normalized":True, "special":False}], normalizer=None, pre_tokenizer=None, post_processor=None, decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))"""
+        assert str(tokenizer) == """Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"my", "single_word":False, "lstrip":False, "rstrip":False, ...}], normalizer=None, pre_tokenizer=None, post_processor=None, decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))"""
+
+    def test_repr_str_ellipsis(self):
+        model = BPE()
+        assert repr(model) == """BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[])"""
+        assert str(model) == """BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[])"""
+
+        vocab = [
+            ("A", 0.0),
+            ("B", -0.01),
+            ("C", -0.02),
+            ("D", -0.03),
+            ("E", -0.04),
+        ]
+        # No ellipsis yet
+        model = Unigram(vocab, 0, byte_fallback=False)
+        assert repr(model) == """Unigram(unk_id=0, vocab=[("A", 0), ("B", -0.01), ("C", -0.02), ("D", -0.03), ("E", -0.04)], byte_fallback=False)"""
+        assert str(model) == """Unigram(unk_id=0, vocab=[("A", 0), ("B", -0.01), ("C", -0.02), ("D", -0.03), ("E", -0.04)], byte_fallback=False)"""
+
+        # Ellipsis for more than 5 elements, and only on `str`
+        vocab = [
+            ("A", 0.0),
+            ("B", -0.01),
+            ("C", -0.02),
+            ("D", -0.03),
+            ("E", -0.04),
+            ("F", -0.04),
+        ]
+        model = Unigram(vocab, 0, byte_fallback=False)
+        assert repr(model) == """Unigram(unk_id=0, vocab=[("A", 0), ("B", -0.01), ("C", -0.02), ("D", -0.03), ("E", -0.04), ("F", -0.04)], byte_fallback=False)"""
+        assert str(model) == """Unigram(unk_id=0, vocab=[("A", 0), ("B", -0.01), ("C", -0.02), ("D", -0.03), ("E", -0.04), ...], byte_fallback=False)"""
+
 
 def check(tokenizer_file) -> bool:
     with open(tokenizer_file, "r") as f:
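Taken together, the patch wires the same two dunder methods into every pipeline component and splits the old single `repr()` helper into two budgets: `__str__` serializes with max_depth=20, at most 6 elements per container, and strings capped at 100 characters, while `__repr__` uses max_depth=200, 100 elements per container, and no string cap. A minimal usage sketch of the resulting behavior, mirroring `test_repr_str` above (output elided here):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my"])

    # __str__ truncates: the added-token entry ends with ", ..." after five fields
    print(str(tokenizer))
    # __repr__ is fuller: it also shows the "normalized" and "special" flags
    print(repr(tokenizer))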