From b77e69862a8ad2f2a869e965e3392d18e3805815 Mon Sep 17 00:00:00 2001 From: Arthur Date: Sat, 7 Oct 2023 08:46:51 +0200 Subject: [PATCH 01/47] nits --- tokenizers/src/pre_tokenizers/metaspace.rs | 6 ++++-- tokenizers/src/tokenizer/mod.rs | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index ad4df5afb..6a03dd978 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -65,12 +65,14 @@ impl Default for Metaspace { impl PreTokenizer for Metaspace { fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { + let mut first_split = true; + pretokenized.split(|_, mut normalized| { normalized.replace(' ', &self.str_rep)?; - if self.add_prefix_space && !normalized.get().starts_with(self.replacement) { + if self.add_prefix_space && first_split { normalized.prepend(&self.str_rep); + first_split = false; // Set the flag to false after the first split } - normalized.split(self.replacement, SplitDelimiterBehavior::MergedWithNext) }) } diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index 915fe7bfa..087f0c396 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -658,6 +658,10 @@ where final_vocab } + /// Get the added tokens encoder + pub fn get_added_tokens_encoder(&self) -> HashMap { + self.added_vocabulary.get_vocab().clone() + } /// Get the added tokens decoder pub fn get_added_tokens_decoder(&self) -> HashMap { @@ -693,6 +697,7 @@ where offsets_type: OffsetType, ) -> Result { let encode = |is_pre_tokenized, subseq_idx, subseq| -> Result { + // FIXME FIXME FIXME all of our SPM issues most probably come from here. The addition of the space after special tokens let normalized = self .added_vocabulary .extract_and_normalize(self.normalizer.as_ref(), subseq); From e7ca464a2e432dbbe5933d3abd8f1a4e84d8ce01 Mon Sep 17 00:00:00 2001 From: Arthur Date: Sat, 7 Oct 2023 10:54:46 +0200 Subject: [PATCH 02/47] allow for legacy beahaviour without making any breaking changes --- bindings/python/src/pre_tokenizers.rs | 10 ++++++++++ tokenizers/src/pre_tokenizers/metaspace.rs | 12 +++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 699d0bbdc..4c0f026e7 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -489,6 +489,16 @@ impl PyMetaspace { setter!(self_, Metaspace, add_prefix_space, add_prefix_space); } + #[getter] + fn get_use_legacy(self_: PyRef) -> bool { + getter!(self_, Metaspace, legacy) + } + + #[setter] + fn set_use_legacy(self_: PyRef, legacy: bool) { + setter!(self_, Metaspace, legacy, legacy); + } + #[new] #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")] fn new( diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 6a03dd978..873944c15 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -9,6 +9,7 @@ use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitD pub struct Metaspace { replacement: char, pub add_prefix_space: bool, + pub legacy:bool, #[serde(skip)] str_rep: String, } @@ -29,6 +30,7 @@ impl<'de> Deserialize<'de> for Metaspace { _type: Type, replacement: char, pub add_prefix_space: bool, + pub legacy: bool, #[serde(skip, rename = "str_rep")] _str_rep: String, } @@ -44,6 +46,7 @@ impl Metaspace { replacement, str_rep: replacement.to_string(), add_prefix_space, + legacy: false, } } @@ -69,9 +72,12 @@ impl PreTokenizer for Metaspace { pretokenized.split(|_, mut normalized| { normalized.replace(' ', &self.str_rep)?; - if self.add_prefix_space && first_split { - normalized.prepend(&self.str_rep); - first_split = false; // Set the flag to false after the first split + if self.add_prefix_space{ + if self.legacy{normalized.prepend(&self.str_rep);} + else if self.add_prefix_space && first_split{ + normalized.prepend(&self.str_rep); + first_split = false; // Set the flag to false after the first split + } } normalized.split(self.replacement, SplitDelimiterBehavior::MergedWithNext) }) From b18b8d599d7dff521d7fce2526a630b8796a9dbc Mon Sep 17 00:00:00 2001 From: Arthur Date: Sat, 7 Oct 2023 10:55:36 +0200 Subject: [PATCH 03/47] add a todo --- tokenizers/src/pre_tokenizers/metaspace.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 873944c15..90096bbc6 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -30,6 +30,7 @@ impl<'de> Deserialize<'de> for Metaspace { _type: Type, replacement: char, pub add_prefix_space: bool, + // TODO Make sure legacy does not have to be here for forward compatibility! pub legacy: bool, #[serde(skip, rename = "str_rep")] _str_rep: String, From f634484aeaa30a02988278dcee64d17294da112a Mon Sep 17 00:00:00 2001 From: Arthur Date: Sat, 7 Oct 2023 11:00:59 +0200 Subject: [PATCH 04/47] set to legacy by default --- tokenizers/src/pre_tokenizers/metaspace.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 90096bbc6..832ab173a 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -47,7 +47,7 @@ impl Metaspace { replacement, str_rep: replacement.to_string(), add_prefix_space, - legacy: false, + legacy: true, } } From f6bcb30009bd857301646c76ee37cf57214948a4 Mon Sep 17 00:00:00 2001 From: Arthur Date: Sat, 7 Oct 2023 12:57:10 +0200 Subject: [PATCH 05/47] skip legacy serialization --- tokenizers/src/pre_tokenizers/metaspace.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 832ab173a..8fb84c57e 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -9,11 +9,17 @@ use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitD pub struct Metaspace { replacement: char, pub add_prefix_space: bool, - pub legacy:bool, + #[serde(skip_serializing_if = "skip_legacy_serialization")] + pub legacy: bool, #[serde(skip)] str_rep: String, } +fn skip_legacy_serialization(legacy: &bool) -> bool { + *legacy // Skip serialization if legacy is true +} + + impl<'de> Deserialize<'de> for Metaspace { fn deserialize(deserializer: D) -> std::result::Result where From 08b6ff7a3e05938458c75c029f96c521d3203e9c Mon Sep 17 00:00:00 2001 From: Arthur Date: Sat, 7 Oct 2023 12:57:21 +0200 Subject: [PATCH 06/47] push correct update --- bindings/python/src/pre_tokenizers.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 4c0f026e7..200e5f236 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -490,12 +490,12 @@ impl PyMetaspace { } #[getter] - fn get_use_legacy(self_: PyRef) -> bool { + fn get_legacy(self_: PyRef) -> bool { getter!(self_, Metaspace, legacy) } #[setter] - fn set_use_legacy(self_: PyRef, legacy: bool) { + fn set_legacy(self_: PyRef, legacy: bool) { setter!(self_, Metaspace, legacy, legacy); } From 2327cd1a3a93f6a5fee4302527367852065c1b2b Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 15:29:58 +0100 Subject: [PATCH 07/47] lint --- tokenizers/src/pre_tokenizers/metaspace.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 8fb84c57e..0b3ba3b8f 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -19,7 +19,6 @@ fn skip_legacy_serialization(legacy: &bool) -> bool { *legacy // Skip serialization if legacy is true } - impl<'de> Deserialize<'de> for Metaspace { fn deserialize(deserializer: D) -> std::result::Result where @@ -79,9 +78,10 @@ impl PreTokenizer for Metaspace { pretokenized.split(|_, mut normalized| { normalized.replace(' ', &self.str_rep)?; - if self.add_prefix_space{ - if self.legacy{normalized.prepend(&self.str_rep);} - else if self.add_prefix_space && first_split{ + if self.add_prefix_space { + if self.legacy { + normalized.prepend(&self.str_rep); + } else if self.add_prefix_space && first_split { normalized.prepend(&self.str_rep); first_split = false; // Set the flag to false after the first split } From 4f82b0d427faacc8619e7d1d38ee73c75c18ff5a Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 16:28:55 +0100 Subject: [PATCH 08/47] add deserialization test --- tokenizers/src/pre_tokenizers/mod.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index 0ec7f9f59..3e503ac13 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -104,6 +104,19 @@ mod tests { PreTokenizerWrapper::Metaspace(Metaspace::new('▁', true)) ])) ); + + + let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str( + r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "legacy":true}"#, + ) + .unwrap(); + + let mut expected_pre_tokenizer = Metaspace::new('▁', true); + expected_pre_tokenizer.legacy = false; + assert_eq!( + pre_tokenizer, + PreTokenizerWrapper::Metaspace(expected_pre_tokenizer) + ); } #[test] From 3e0f2be71bcdabd6db7d1a28774297a2664720be Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 16:29:45 +0100 Subject: [PATCH 09/47] add a python test as well --- bindings/python/tests/bindings/test_pre_tokenizers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bindings/python/tests/bindings/test_pre_tokenizers.py b/bindings/python/tests/bindings/test_pre_tokenizers.py index 90b468c12..a2b30e450 100644 --- a/bindings/python/tests/bindings/test_pre_tokenizers.py +++ b/bindings/python/tests/bindings/test_pre_tokenizers.py @@ -110,6 +110,9 @@ def test_can_modify(self): assert pretok.replacement == "%" pretok.add_prefix_space = True assert pretok.add_prefix_space == True + pretok.legacy = False + assert pretok.legacy == False + class TestCharDelimiterSplit: From 0ef1d6f728ebde69adc0b2b185d9a2bfc8b4092f Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 16:52:22 +0100 Subject: [PATCH 10/47] updates --- tokenizers/src/decoders/mod.rs | 1 - tokenizers/src/pre_tokenizers/metaspace.rs | 5 +++++ tokenizers/src/pre_tokenizers/mod.rs | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tokenizers/src/decoders/mod.rs b/tokenizers/src/decoders/mod.rs index 94204b8f1..e8d8758d3 100644 --- a/tokenizers/src/decoders/mod.rs +++ b/tokenizers/src/decoders/mod.rs @@ -78,7 +78,6 @@ mod tests { let serialized = serde_json::to_string(&decoder).unwrap(); assert_eq!(serialized, json); } - #[test] fn decoder_serialization_other_no_arg() { let json = r#"{"type":"Sequence","decoders":[{"type":"Fuse"},{"type":"Metaspace","replacement":"▁","add_prefix_space":true}]}"#; diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 0b3ba3b8f..9392bea71 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -19,6 +19,10 @@ fn skip_legacy_serialization(legacy: &bool) -> bool { *legacy // Skip serialization if legacy is true } +fn default_legacy_value() -> bool { + true +} + impl<'de> Deserialize<'de> for Metaspace { fn deserialize(deserializer: D) -> std::result::Result where @@ -36,6 +40,7 @@ impl<'de> Deserialize<'de> for Metaspace { replacement: char, pub add_prefix_space: bool, // TODO Make sure legacy does not have to be here for forward compatibility! + #[serde(default = "default_legacy_value")] pub legacy: bool, #[serde(skip, rename = "str_rep")] _str_rep: String, diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index 3e503ac13..03a7e2ac9 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -107,7 +107,7 @@ mod tests { let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str( - r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "legacy":true}"#, + r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "legacy":false}"#, ) .unwrap(); From ed24483cce8dd870cb102a3297be5a568eedda44 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 17:00:35 +0100 Subject: [PATCH 11/47] fix serialization tests --- tokenizers/src/pre_tokenizers/metaspace.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 9392bea71..0313ccffb 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -47,7 +47,9 @@ impl<'de> Deserialize<'de> for Metaspace { } let helper = MetaspaceHelper::deserialize(deserializer)?; - Ok(Self::new(helper.replacement, helper.add_prefix_space)) + let mut instance = Self::new(helper.replacement, helper.add_prefix_space); + instance.legacy = helper.legacy; + Ok(instance) } } From ba5a2840ab6e383e5050a693f4a4b652cb9605c8 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 17:01:06 +0100 Subject: [PATCH 12/47] nits --- tokenizers/src/pre_tokenizers/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index 03a7e2ac9..d36587b79 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -105,7 +105,6 @@ mod tests { ])) ); - let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str( r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "legacy":false}"#, ) From b2bb369bab153dd5992b36f72bae0cfae08b2915 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 17:25:23 +0100 Subject: [PATCH 13/47] python stylijng of the tests --- bindings/python/tests/bindings/test_pre_tokenizers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bindings/python/tests/bindings/test_pre_tokenizers.py b/bindings/python/tests/bindings/test_pre_tokenizers.py index a2b30e450..5b4d236c5 100644 --- a/bindings/python/tests/bindings/test_pre_tokenizers.py +++ b/bindings/python/tests/bindings/test_pre_tokenizers.py @@ -114,7 +114,6 @@ def test_can_modify(self): assert pretok.legacy == False - class TestCharDelimiterSplit: def test_instantiate(self): assert CharDelimiterSplit("-") is not None From 948d2dd3e2d91391887353261716a910c66e645b Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 21:38:27 +0100 Subject: [PATCH 14/47] better tests --- tokenizers/src/pre_tokenizers/metaspace.rs | 38 ++++++++++++++++++++-- tokenizers/src/pre_tokenizers/mod.rs | 10 ++++++ tokenizers/src/tokenizer/mod.rs | 1 - 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 0313ccffb..102b6f60a 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -1,6 +1,7 @@ use serde::{Deserialize, Deserializer, Serialize}; use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; +use regex::Regex; #[derive(Debug, Clone, PartialEq, Serialize, Eq)] /// Replaces all the whitespaces by the provided meta character and then @@ -85,10 +86,10 @@ impl PreTokenizer for Metaspace { pretokenized.split(|_, mut normalized| { normalized.replace(' ', &self.str_rep)?; - if self.add_prefix_space { + if self.add_prefix_space && !normalized.get().starts_with(self.replacement){ if self.legacy { normalized.prepend(&self.str_rep); - } else if self.add_prefix_space && first_split { + } else if first_split { normalized.prepend(&self.str_rep); first_split = false; // Set the flag to false after the first split } @@ -210,6 +211,39 @@ mod tests { ); } + #[test] + fn non_legacy_meta_space(){ + let mut pretok = Metaspace::new('▁', true); + pretok.legacy = false; + let mut pretokenized = PreTokenizedString::from("Hey my friend how▁are you"); + let re_ref = Regex::new(r"()").unwrap(); + pretokenized.split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated)).expect("AddedVocabulary bad split"); + println!("{:?}",pretokenized); + + pretok.pre_tokenize(&mut pretokenized).unwrap(); + assert_eq!( + pretokenized + .get_splits(OffsetReferential::Normalized, OffsetType::Byte) + .into_iter() + .map(|(s, o, _)| (s, o)) + .collect::>(), + vec![ + ("▁Hey", (0, 6)), ("▁my", (6, 11)), ("▁friend", (11, 20)), ("▁", (20, 23)), ("", (23, 26)), ("how", (26, 29)), ("▁are", (29, 35)), ("▁you", (35, 41)) + ] + ); + pretok.legacy = true; + pretok.pre_tokenize(&mut pretokenized).unwrap(); + assert_eq!( + pretokenized + .get_splits(OffsetReferential::Normalized, OffsetType::Byte) + .into_iter() + .map(|(s, o, _)| (s, o)) + .collect::>(), + vec![ + ("▁Hey", (0, 6)), ("▁my", (6, 11)), ("▁friend", (11, 20)), ("▁", (20, 23)), ("▁", (23, 26)), ("▁how", (26, 29)), ("▁are", (29, 35)), ("▁you", (35, 41)) + ] + ); + } #[test] fn decode() { let decoder = Metaspace::new('▁', true); diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index d36587b79..90287d035 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -116,6 +116,16 @@ mod tests { pre_tokenizer, PreTokenizerWrapper::Metaspace(expected_pre_tokenizer) ); + + let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str( + r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "legacy":true}"#, + ) + .unwrap(); + + assert_eq!( + pre_tokenizer, + PreTokenizerWrapper::Metaspace(Metaspace::new('▁', true)) + ); } #[test] diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index 9c2f4da32..d939fc6ce 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -697,7 +697,6 @@ where offsets_type: OffsetType, ) -> Result { let encode = |is_pre_tokenized, subseq_idx, subseq| -> Result { - // FIXME FIXME FIXME all of our SPM issues most probably come from here. The addition of the space after special tokens let normalized = self .added_vocabulary .extract_and_normalize(self.normalizer.as_ref(), subseq); From dbe25cd6451f274430f1fde22767b424bbb30bb0 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 21:41:00 +0100 Subject: [PATCH 15/47] fix offsets --- tokenizers/src/pre_tokenizers/metaspace.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 102b6f60a..d3fcf3097 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -1,7 +1,6 @@ use serde::{Deserialize, Deserializer, Serialize}; use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; -use regex::Regex; #[derive(Debug, Clone, PartialEq, Serialize, Eq)] /// Replaces all the whitespaces by the provided meta character and then @@ -240,7 +239,7 @@ mod tests { .map(|(s, o, _)| (s, o)) .collect::>(), vec![ - ("▁Hey", (0, 6)), ("▁my", (6, 11)), ("▁friend", (11, 20)), ("▁", (20, 23)), ("▁", (23, 26)), ("▁how", (26, 29)), ("▁are", (29, 35)), ("▁you", (35, 41)) + ("▁Hey", (0, 6)), ("▁my", (6, 11)), ("▁friend", (11, 20)), ("▁", (20, 23)), ("▁", (23, 29)), ("▁how", (29, 35)), ("▁are", (35, 41)), ("▁you", (41, 47)) ] ); } From 430966742ca7101cdaa6f975e342076e468e3151 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 21:42:15 +0100 Subject: [PATCH 16/47] fix imports --- tokenizers/src/pre_tokenizers/metaspace.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index d3fcf3097..b11be137a 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -125,6 +125,8 @@ impl Decoder for Metaspace { #[cfg(test)] mod tests { + use regex::Regex; + use super::*; use crate::{OffsetReferential, OffsetType}; From ca048d5d393ea79a7beec8e2be81379ccbd58ed7 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 21:42:26 +0100 Subject: [PATCH 17/47] fmt --- tokenizers/src/pre_tokenizers/metaspace.rs | 28 +++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index b11be137a..38befa751 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -85,7 +85,7 @@ impl PreTokenizer for Metaspace { pretokenized.split(|_, mut normalized| { normalized.replace(' ', &self.str_rep)?; - if self.add_prefix_space && !normalized.get().starts_with(self.replacement){ + if self.add_prefix_space && !normalized.get().starts_with(self.replacement) { if self.legacy { normalized.prepend(&self.str_rep); } else if first_split { @@ -213,13 +213,15 @@ mod tests { } #[test] - fn non_legacy_meta_space(){ + fn non_legacy_meta_space() { let mut pretok = Metaspace::new('▁', true); pretok.legacy = false; let mut pretokenized = PreTokenizedString::from("Hey my friend how▁are you"); let re_ref = Regex::new(r"()").unwrap(); - pretokenized.split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated)).expect("AddedVocabulary bad split"); - println!("{:?}",pretokenized); + pretokenized + .split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated)) + .expect("AddedVocabulary bad split"); + println!("{:?}", pretokenized); pretok.pre_tokenize(&mut pretokenized).unwrap(); assert_eq!( @@ -229,7 +231,14 @@ mod tests { .map(|(s, o, _)| (s, o)) .collect::>(), vec![ - ("▁Hey", (0, 6)), ("▁my", (6, 11)), ("▁friend", (11, 20)), ("▁", (20, 23)), ("", (23, 26)), ("how", (26, 29)), ("▁are", (29, 35)), ("▁you", (35, 41)) + ("▁Hey", (0, 6)), + ("▁my", (6, 11)), + ("▁friend", (11, 20)), + ("▁", (20, 23)), + ("", (23, 26)), + ("how", (26, 29)), + ("▁are", (29, 35)), + ("▁you", (35, 41)) ] ); pretok.legacy = true; @@ -241,7 +250,14 @@ mod tests { .map(|(s, o, _)| (s, o)) .collect::>(), vec![ - ("▁Hey", (0, 6)), ("▁my", (6, 11)), ("▁friend", (11, 20)), ("▁", (20, 23)), ("▁", (23, 29)), ("▁how", (29, 35)), ("▁are", (35, 41)), ("▁you", (41, 47)) + ("▁Hey", (0, 6)), + ("▁my", (6, 11)), + ("▁friend", (11, 20)), + ("▁", (20, 23)), + ("▁", (23, 29)), + ("▁how", (29, 35)), + ("▁are", (35, 41)), + ("▁you", (41, 47)) ] ); } From 57052629fab88b42757186aa7d8574a65a108a3e Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 22:15:24 +0100 Subject: [PATCH 18/47] update metaspace --- tokenizers/src/pre_tokenizers/metaspace.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 38befa751..f44fa12be 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -220,7 +220,7 @@ mod tests { let re_ref = Regex::new(r"()").unwrap(); pretokenized .split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated)) - .expect("AddedVocabulary bad split"); + .expect("Bad split"); println!("{:?}", pretokenized); pretok.pre_tokenize(&mut pretokenized).unwrap(); From 2b1635375739837814996bb4e34ebc32f20a15b7 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 10 Nov 2023 22:16:05 +0100 Subject: [PATCH 19/47] remove TODO --- tokenizers/src/pre_tokenizers/metaspace.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index f44fa12be..b6d8e56bf 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -39,7 +39,6 @@ impl<'de> Deserialize<'de> for Metaspace { _type: Type, replacement: char, pub add_prefix_space: bool, - // TODO Make sure legacy does not have to be here for forward compatibility! #[serde(default = "default_legacy_value")] pub legacy: bool, #[serde(skip, rename = "str_rep")] From eaf24bb6ea8716b6f023300d24a3aefee8a8e9a2 Mon Sep 17 00:00:00 2001 From: Arthur Date: Mon, 13 Nov 2023 17:03:44 +0100 Subject: [PATCH 20/47] use enm --- .../tests/bindings/test_pre_tokenizers.py | 4 +- tokenizers/src/pre_tokenizers/metaspace.rs | 106 ++++++++++++++---- tokenizers/src/pre_tokenizers/mod.rs | 5 +- 3 files changed, 88 insertions(+), 27 deletions(-) diff --git a/bindings/python/tests/bindings/test_pre_tokenizers.py b/bindings/python/tests/bindings/test_pre_tokenizers.py index 5b4d236c5..daf827b26 100644 --- a/bindings/python/tests/bindings/test_pre_tokenizers.py +++ b/bindings/python/tests/bindings/test_pre_tokenizers.py @@ -110,8 +110,8 @@ def test_can_modify(self): assert pretok.replacement == "%" pretok.add_prefix_space = True assert pretok.add_prefix_space == True - pretok.legacy = False - assert pretok.legacy == False + pretok.prepend_scheme = "never" + assert pretok.prepend_scheme == "never" class TestCharDelimiterSplit: diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index b6d8e56bf..bb387f3df 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -1,7 +1,36 @@ -use serde::{Deserialize, Deserializer, Serialize}; +use serde::{de::Error, Deserialize, Deserializer, Serialize}; use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; +/// Enum representing options for the metaspace prepending scheme. +#[derive(Debug, Clone, PartialEq, Serialize, Eq, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum PrependScheme { + /// Specifies that the scheme should be prepended only once, on the first split. + First, + /// Specifies that the space should not be prepended. + Never, + /// Specifies that the scheme should always be prepended. + Always, +} + +impl From<&str> for PrependScheme { + fn from(s: &str) -> Self { + match s.to_lowercase().as_str() { + "first" => PrependScheme::First, + "never" => PrependScheme::Never, + "always" => PrependScheme::Always, + _ => panic!("Invalid value for PrependScheme: {}", s), + } + } +} + +impl From for PrependScheme { + fn from(s: String) -> Self { + PrependScheme::from(s.as_str()) + } +} + #[derive(Debug, Clone, PartialEq, Serialize, Eq)] /// Replaces all the whitespaces by the provided meta character and then /// splits on this character @@ -9,20 +38,11 @@ use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitD pub struct Metaspace { replacement: char, pub add_prefix_space: bool, - #[serde(skip_serializing_if = "skip_legacy_serialization")] - pub legacy: bool, + pub prepend_scheme: PrependScheme, #[serde(skip)] str_rep: String, } -fn skip_legacy_serialization(legacy: &bool) -> bool { - *legacy // Skip serialization if legacy is true -} - -fn default_legacy_value() -> bool { - true -} - impl<'de> Deserialize<'de> for Metaspace { fn deserialize(deserializer: D) -> std::result::Result where @@ -33,21 +53,28 @@ impl<'de> Deserialize<'de> for Metaspace { Metaspace, } + fn default_prepend_scheme_value() -> PrependScheme { + PrependScheme::Always + } + #[derive(Deserialize)] pub struct MetaspaceHelper { #[serde(rename = "type")] _type: Type, replacement: char, pub add_prefix_space: bool, - #[serde(default = "default_legacy_value")] - pub legacy: bool, + #[serde(default = "default_prepend_scheme_value")] + pub prepend_scheme: PrependScheme, #[serde(skip, rename = "str_rep")] _str_rep: String, } let helper = MetaspaceHelper::deserialize(deserializer)?; - let mut instance = Self::new(helper.replacement, helper.add_prefix_space); - instance.legacy = helper.legacy; + let instance = Self::new_with_prepend_scheme( + helper.replacement, + helper.add_prefix_space, + helper.prepend_scheme, + ); Ok(instance) } } @@ -58,7 +85,20 @@ impl Metaspace { replacement, str_rep: replacement.to_string(), add_prefix_space, - legacy: true, + prepend_scheme: PrependScheme::Always, // always prepend for legacy purpose + } + } + + pub fn new_with_prepend_scheme( + replacement: char, + add_prefix_space: bool, + prepend_scheme: PrependScheme, + ) -> Self { + Self { + replacement, + str_rep: replacement.to_string(), + add_prefix_space, + prepend_scheme: prepend_scheme, } } @@ -70,6 +110,9 @@ impl Metaspace { self.replacement = replacement; self.str_rep = replacement.to_string(); } + pub fn set_prepend_scheme(&mut self, scheme: impl Into){ + self.prepend_scheme = scheme.into(); + } } impl Default for Metaspace { @@ -85,13 +128,16 @@ impl PreTokenizer for Metaspace { pretokenized.split(|_, mut normalized| { normalized.replace(' ', &self.str_rep)?; if self.add_prefix_space && !normalized.get().starts_with(self.replacement) { - if self.legacy { + if self.prepend_scheme == PrependScheme::Always { normalized.prepend(&self.str_rep); - } else if first_split { + } else if self.prepend_scheme == PrependScheme::First && first_split { normalized.prepend(&self.str_rep); - first_split = false; // Set the flag to false after the first split + first_split = false; } + } else { + first_split = false; } + normalized.split(self.replacement, SplitDelimiterBehavior::MergedWithNext) }) } @@ -214,13 +260,29 @@ mod tests { #[test] fn non_legacy_meta_space() { let mut pretok = Metaspace::new('▁', true); - pretok.legacy = false; + pretok.set_prepend_scheme("always".to_string()); + assert_eq!( + pretok, + Metaspace::new_with_prepend_scheme('▁', true, PrependScheme::Always) + ); + + pretok.set_prepend_scheme("never".to_string()); + assert_eq!( + pretok, + Metaspace::new_with_prepend_scheme('▁', true, PrependScheme::Never) + ); + + pretok.set_prepend_scheme("first".to_string()); + assert_eq!( + pretok, + Metaspace::new_with_prepend_scheme('▁', true, PrependScheme::First) + ); + let mut pretokenized = PreTokenizedString::from("Hey my friend how▁are you"); let re_ref = Regex::new(r"()").unwrap(); pretokenized .split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated)) .expect("Bad split"); - println!("{:?}", pretokenized); pretok.pre_tokenize(&mut pretokenized).unwrap(); assert_eq!( @@ -240,7 +302,7 @@ mod tests { ("▁you", (35, 41)) ] ); - pretok.legacy = true; + pretok.set_prepend_scheme("never"); pretok.pre_tokenize(&mut pretokenized).unwrap(); assert_eq!( pretokenized diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index 90287d035..b5d7672e7 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -106,19 +106,18 @@ mod tests { ); let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str( - r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "legacy":false}"#, + r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "prepend_scheme":"first"}"#, ) .unwrap(); let mut expected_pre_tokenizer = Metaspace::new('▁', true); - expected_pre_tokenizer.legacy = false; assert_eq!( pre_tokenizer, PreTokenizerWrapper::Metaspace(expected_pre_tokenizer) ); let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str( - r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "legacy":true}"#, + r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "prepend_scheme":"always"}"#, ) .unwrap(); From 3ec7b547fdb26f657c3a34a4cae3ed87fc172dca Mon Sep 17 00:00:00 2001 From: Arthur Date: Mon, 13 Nov 2023 17:20:28 +0100 Subject: [PATCH 21/47] fix some tses --- tokenizers/src/pre_tokenizers/metaspace.rs | 49 +++++++++++++++++++++- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index bb387f3df..7502034f9 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -110,7 +110,7 @@ impl Metaspace { self.replacement = replacement; self.str_rep = replacement.to_string(); } - pub fn set_prepend_scheme(&mut self, scheme: impl Into){ + pub fn set_prepend_scheme(&mut self, scheme: impl Into) { self.prepend_scheme = scheme.into(); } } @@ -302,7 +302,7 @@ mod tests { ("▁you", (35, 41)) ] ); - pretok.set_prepend_scheme("never"); + pretok.set_prepend_scheme("always"); pretok.pre_tokenize(&mut pretokenized).unwrap(); assert_eq!( pretokenized @@ -321,6 +321,51 @@ mod tests { ("▁you", (41, 47)) ] ); + + pretok.set_prepend_scheme("first"); + let mut pretokenized = PreTokenizedString::from(" Hey how"); // test with prefix + pretokenized + .split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated)) + .expect("Bad split"); + pretok.pre_tokenize(&mut pretokenized).unwrap(); + assert_eq!( + pretokenized + .get_splits(OffsetReferential::Normalized, OffsetType::Byte) + .into_iter() + .map(|(s, o, _)| (s, o)) + .collect::>(), + vec![ + ("▁Hey", (0, 6)), + ("▁", (6, 9)), + ("", (9, 12)), + ("how", (12, 15)) + ] + ); + + let mut pretokenized = PreTokenizedString::from(" Hey how are you"); // test with many splits + pretokenized + .split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated)) + .expect("Bad split"); + pretok.pre_tokenize(&mut pretokenized).unwrap(); + assert_eq!( + pretokenized + .get_splits(OffsetReferential::Normalized, OffsetType::Byte) + .into_iter() + .map(|(s, o, _)| (s, o)) + .collect::>(), + vec![ + ("▁Hey", (0, 6)), + ("▁", (6, 9)), + ("", (9, 12)), + ("how", (12, 15)), + ("▁", (15, 18)), + ("", (18, 21)), + ("are", (21, 24)), + ("▁", (24, 27)), + ("", (27, 30)), + ("▁you", (30, 36)) + ] + ); } #[test] fn decode() { From 34355e54ce43f46ad721bff6b1a3833c179817f3 Mon Sep 17 00:00:00 2001 From: Arthur Date: Mon, 13 Nov 2023 17:22:43 +0100 Subject: [PATCH 22/47] nits --- tokenizers/src/pre_tokenizers/mod.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index b5d7672e7..fc8b9a0cf 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -110,10 +110,13 @@ mod tests { ) .unwrap(); - let mut expected_pre_tokenizer = Metaspace::new('▁', true); assert_eq!( pre_tokenizer, - PreTokenizerWrapper::Metaspace(expected_pre_tokenizer) + PreTokenizerWrapper::Metaspace(Metaspace::new_with_prepend_scheme( + '▁', + true, + metaspace::PrependScheme::First + )) ); let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str( From 8697912cbf5571aee7ddff1c4f5ed031c9555be3 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 08:15:38 +0100 Subject: [PATCH 23/47] use enum --- bindings/python/src/pre_tokenizers.rs | 29 +++++++++++++++------- tokenizers/src/pre_tokenizers/metaspace.rs | 13 +++++++++- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 200e5f236..458496882 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -11,7 +11,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer; use tk::pre_tokenizers::byte_level::ByteLevel; use tk::pre_tokenizers::delimiter::CharDelimiterSplit; use tk::pre_tokenizers::digits::Digits; -use tk::pre_tokenizers::metaspace::Metaspace; +use tk::pre_tokenizers::metaspace::{Metaspace}; use tk::pre_tokenizers::punctuation::Punctuation; use tk::pre_tokenizers::split::Split; use tk::pre_tokenizers::unicode_scripts::UnicodeScripts; @@ -490,13 +490,13 @@ impl PyMetaspace { } #[getter] - fn get_legacy(self_: PyRef) -> bool { - getter!(self_, Metaspace, legacy) + fn get_prepend_scheme(self_: PyRef) -> String { + getter!(self_, Metaspace, prepend_scheme.to_string()) } #[setter] - fn set_legacy(self_: PyRef, legacy: bool) { - setter!(self_, Metaspace, legacy, legacy); + fn set_prepend_scheme(self_: PyRef, prepend_scheme: String) { + setter!(self_, Metaspace, @set_prepend_scheme, prepend_scheme); } #[new] @@ -506,10 +506,21 @@ impl PyMetaspace { add_prefix_space: bool, _kwargs: Option<&PyDict>, ) -> (Self, PyPreTokenizer) { - ( - PyMetaspace {}, - Metaspace::new(replacement.0, add_prefix_space).into(), - ) + // Create a new Metaspace instance + let mut new_instance: Metaspace = Metaspace::new(replacement.0, add_prefix_space); + + // Extract values from _kwargs if present + let prepend_scheme: Option = _kwargs.and_then(|kwargs| { + kwargs.get_item("prepend_scheme").and_then(|prepend_scheme| prepend_scheme.extract().ok()) + }); + // Perform an action if prepend_scheme is present + if let Some(prepend_scheme_value) = prepend_scheme { + // Do something with prepend_scheme_value + new_instance.set_prepend_scheme(prepend_scheme_value); + } + + // Return the new Metaspace instance and PyPreTokenizer + (PyMetaspace {}, new_instance.into()) } } diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 7502034f9..f48318e5d 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -1,4 +1,4 @@ -use serde::{de::Error, Deserialize, Deserializer, Serialize}; +use serde::{Deserialize, Deserializer, Serialize}; use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; @@ -31,6 +31,17 @@ impl From for PrependScheme { } } +impl PrependScheme { + pub fn to_string(&self) -> String { + match self { + PrependScheme::First => "first".to_string(), + PrependScheme::Never => "never".to_string(), + PrependScheme::Always => "always".to_string(), + } + } +} + + #[derive(Debug, Clone, PartialEq, Serialize, Eq)] /// Replaces all the whitespaces by the provided meta character and then /// splits on this character From 562227be0245a1e8a64cba25095a346023369357 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 08:23:40 +0100 Subject: [PATCH 24/47] update tests --- tokenizers/src/decoders/mod.rs | 6 +++--- tokenizers/src/pre_tokenizers/metaspace.rs | 22 ++++++++++------------ 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/tokenizers/src/decoders/mod.rs b/tokenizers/src/decoders/mod.rs index e8d8758d3..87beeca5b 100644 --- a/tokenizers/src/decoders/mod.rs +++ b/tokenizers/src/decoders/mod.rs @@ -73,14 +73,14 @@ mod tests { #[test] fn decoder_serialization() { - let json = r#"{"type":"Sequence","decoders":[{"type":"ByteFallback"},{"type":"Metaspace","replacement":"▁","add_prefix_space":true}]}"#; + let json = r#"{"type":"Sequence","decoders":[{"type":"ByteFallback"},{"type":"Metaspace","replacement":"▁","add_prefix_space":true,"prepend_scheme":"always"}]}"#; let decoder: DecoderWrapper = serde_json::from_str(json).unwrap(); let serialized = serde_json::to_string(&decoder).unwrap(); assert_eq!(serialized, json); } #[test] fn decoder_serialization_other_no_arg() { - let json = r#"{"type":"Sequence","decoders":[{"type":"Fuse"},{"type":"Metaspace","replacement":"▁","add_prefix_space":true}]}"#; + let json = r#"{"type":"Sequence","decoders":[{"type":"Fuse"},{"type":"Metaspace","replacement":"▁","add_prefix_space":true,"prepend_scheme":"always"}]}"#; let decoder: DecoderWrapper = serde_json::from_str(json).unwrap(); let serialized = serde_json::to_string(&decoder).unwrap(); assert_eq!(serialized, json); @@ -88,7 +88,7 @@ mod tests { #[test] fn decoder_serialization_no_decode() { - let json = r#"{"type":"Sequence","decoders":[{},{"type":"Metaspace","replacement":"▁","add_prefix_space":true}]}"#; + let json = r#"{"type":"Sequence","decoders":[{},{"type":"Metaspace","replacement":"▁","add_prefix_space":true,"prepend_scheme":"always"}]}"#; assert!(serde_json::from_str::(json).is_err()); } } diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index f48318e5d..7c85379e5 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -1,6 +1,6 @@ -use serde::{Deserialize, Deserializer, Serialize}; - use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; +use serde::{Deserialize, Deserializer, Serialize}; +use std::fmt; /// Enum representing options for the metaspace prepending scheme. #[derive(Debug, Clone, PartialEq, Serialize, Eq, Deserialize)] @@ -31,17 +31,16 @@ impl From for PrependScheme { } } -impl PrependScheme { - pub fn to_string(&self) -> String { +impl fmt::Display for PrependScheme { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - PrependScheme::First => "first".to_string(), - PrependScheme::Never => "never".to_string(), - PrependScheme::Always => "always".to_string(), + PrependScheme::First => write!(f, "first"), + PrependScheme::Never => write!(f, "never"), + PrependScheme::Always => write!(f, "always"), } } } - #[derive(Debug, Clone, PartialEq, Serialize, Eq)] /// Replaces all the whitespaces by the provided meta character and then /// splits on this character @@ -109,7 +108,7 @@ impl Metaspace { replacement, str_rep: replacement.to_string(), add_prefix_space, - prepend_scheme: prepend_scheme, + prepend_scheme, } } @@ -189,7 +188,7 @@ mod tests { #[test] fn serialization() { let metaspace = Metaspace::new('_', true); - let metaspace_s = r#"{"type":"Metaspace","replacement":"_","add_prefix_space":true}"#; + let metaspace_s = r#"{"type":"Metaspace","replacement":"_","add_prefix_space":true,"prepend_scheme":"always"}"#; assert_eq!(serde_json::to_string(&metaspace).unwrap(), metaspace_s); assert_eq!( serde_json::from_str::(metaspace_s).unwrap(), @@ -198,8 +197,7 @@ mod tests { // Also check it can deserialize previous versions let metaspace = Metaspace::new('_', true); - let metaspace_s = - r#"{"type":"Metaspace","str_rep":"_","replacement":"_","add_prefix_space":true}"#; + let metaspace_s = r#"{"type":"Metaspace","str_rep":"_","replacement":"_","add_prefix_space":true,"prepend_scheme":"always"}"#; assert_eq!( serde_json::from_str::(metaspace_s).unwrap(), metaspace From 4e714dbb186ca6a53d507b356ee54aea7c158702 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 08:24:10 +0100 Subject: [PATCH 25/47] syling --- bindings/python/src/pre_tokenizers.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 458496882..6484d1cbb 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -11,7 +11,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer; use tk::pre_tokenizers::byte_level::ByteLevel; use tk::pre_tokenizers::delimiter::CharDelimiterSplit; use tk::pre_tokenizers::digits::Digits; -use tk::pre_tokenizers::metaspace::{Metaspace}; +use tk::pre_tokenizers::metaspace::Metaspace; use tk::pre_tokenizers::punctuation::Punctuation; use tk::pre_tokenizers::split::Split; use tk::pre_tokenizers::unicode_scripts::UnicodeScripts; @@ -511,7 +511,9 @@ impl PyMetaspace { // Extract values from _kwargs if present let prepend_scheme: Option = _kwargs.and_then(|kwargs| { - kwargs.get_item("prepend_scheme").and_then(|prepend_scheme| prepend_scheme.extract().ok()) + kwargs + .get_item("prepend_scheme") + .and_then(|prepend_scheme| prepend_scheme.extract().ok()) }); // Perform an action if prepend_scheme is present if let Some(prepend_scheme_value) = prepend_scheme { From 61073519d90c504f77071f77826788f1096a6857 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 14:36:41 +0100 Subject: [PATCH 26/47] remove impl from for PrependScheme --- tokenizers/src/pre_tokenizers/metaspace.rs | 33 ++-------------------- 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 7c85379e5..2d57b636d 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Deserializer, Serialize}; use std::fmt; /// Enum representing options for the metaspace prepending scheme. -#[derive(Debug, Clone, PartialEq, Serialize, Eq, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Eq, Deserialize, FromPyObject)] #[serde(rename_all = "snake_case")] pub enum PrependScheme { /// Specifies that the scheme should be prepended only once, on the first split. @@ -14,33 +14,6 @@ pub enum PrependScheme { Always, } -impl From<&str> for PrependScheme { - fn from(s: &str) -> Self { - match s.to_lowercase().as_str() { - "first" => PrependScheme::First, - "never" => PrependScheme::Never, - "always" => PrependScheme::Always, - _ => panic!("Invalid value for PrependScheme: {}", s), - } - } -} - -impl From for PrependScheme { - fn from(s: String) -> Self { - PrependScheme::from(s.as_str()) - } -} - -impl fmt::Display for PrependScheme { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - PrependScheme::First => write!(f, "first"), - PrependScheme::Never => write!(f, "never"), - PrependScheme::Always => write!(f, "always"), - } - } -} - #[derive(Debug, Clone, PartialEq, Serialize, Eq)] /// Replaces all the whitespaces by the provided meta character and then /// splits on this character @@ -120,8 +93,8 @@ impl Metaspace { self.replacement = replacement; self.str_rep = replacement.to_string(); } - pub fn set_prepend_scheme(&mut self, scheme: impl Into) { - self.prepend_scheme = scheme.into(); + pub fn set_prepend_scheme(&mut self, scheme: PrependScheme) { + self.prepend_scheme = scheme; } } From d86b036b816c601bf7fe78fe0c161d8001cc34fc Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 15:08:07 +0100 Subject: [PATCH 27/47] use simple getters and setters --- bindings/python/src/pre_tokenizers.rs | 60 ++++++++++++++++------ tokenizers/src/pre_tokenizers/metaspace.rs | 9 +++- 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 6484d1cbb..e0e2c1da5 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -11,7 +11,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer; use tk::pre_tokenizers::byte_level::ByteLevel; use tk::pre_tokenizers::delimiter::CharDelimiterSplit; use tk::pre_tokenizers::digits::Digits; -use tk::pre_tokenizers::metaspace::Metaspace; +use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme}; use tk::pre_tokenizers::punctuation::Punctuation; use tk::pre_tokenizers::split::Split; use tk::pre_tokenizers::unicode_scripts::UnicodeScripts; @@ -491,12 +491,31 @@ impl PyMetaspace { #[getter] fn get_prepend_scheme(self_: PyRef) -> String { - getter!(self_, Metaspace, prepend_scheme.to_string()) + // Assuming Metaspace has a method to get the prepend_scheme as a string + let scheme: PrependScheme = getter!(self_, Metaspace, get_prepend_scheme()); + match scheme { + PrependScheme::First => "First", + PrependScheme::Never => "Never", + PrependScheme::Always => "Always", + }.to_string() } #[setter] - fn set_prepend_scheme(self_: PyRef, prepend_scheme: String) { - setter!(self_, Metaspace, @set_prepend_scheme, prepend_scheme); + fn set_prepend_scheme(self_: PyRef, prepend_scheme: &str) -> PyResult<()> { + // Assuming Metaspace has a method to set the prepend_scheme from a string + let scheme = match prepend_scheme.to_lowercase().as_str() { + "first" => PrependScheme::First, + "never" => PrependScheme::Never, + "always" => PrependScheme::Always, + _ => { + return Err(exceptions::PyValueError::new_err(format!( + "Unknown variant: {}", + prepend_scheme + ))) + } + }; + setter!(self_, Metaspace, @set_prepend_scheme, scheme); + Ok(()) } #[new] @@ -505,24 +524,35 @@ impl PyMetaspace { replacement: PyChar, add_prefix_space: bool, _kwargs: Option<&PyDict>, - ) -> (Self, PyPreTokenizer) { + ) -> PyResult<(Self, PyPreTokenizer)> { // Create a new Metaspace instance let mut new_instance: Metaspace = Metaspace::new(replacement.0, add_prefix_space); - // Extract values from _kwargs if present - let prepend_scheme: Option = _kwargs.and_then(|kwargs| { + if let Some(prepend_scheme) = _kwargs.and_then(|kwargs| { kwargs .get_item("prepend_scheme") - .and_then(|prepend_scheme| prepend_scheme.extract().ok()) - }); - // Perform an action if prepend_scheme is present - if let Some(prepend_scheme_value) = prepend_scheme { - // Do something with prepend_scheme_value - new_instance.set_prepend_scheme(prepend_scheme_value); + .and_then(|prepend_scheme| prepend_scheme.extract::().ok()) + }) { + // Unwrap the Option before calling to_lowercase + let prepend_scheme_value:String = prepend_scheme.to_lowercase(); + + let prepend_scheme_enum = match prepend_scheme_value.as_str() { + "first" => PrependScheme::First, + "never" => PrependScheme::Never, + "always" => PrependScheme::Always, + _ => { + return Err(exceptions::PyValueError::new_err(format!( + "Unknown variant: {}", + prepend_scheme_value + ))) + } + }; + + // Set the prepend_scheme in the new instance + new_instance.set_prepend_scheme(prepend_scheme_enum); } - // Return the new Metaspace instance and PyPreTokenizer - (PyMetaspace {}, new_instance.into()) + Ok((PyMetaspace {}, new_instance.into())) } } diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 2d57b636d..262747683 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -1,9 +1,8 @@ use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; use serde::{Deserialize, Deserializer, Serialize}; -use std::fmt; /// Enum representing options for the metaspace prepending scheme. -#[derive(Debug, Clone, PartialEq, Serialize, Eq, Deserialize, FromPyObject)] +#[derive(Debug, Clone, PartialEq, Serialize, Eq, Deserialize)] #[serde(rename_all = "snake_case")] pub enum PrependScheme { /// Specifies that the scheme should be prepended only once, on the first split. @@ -93,9 +92,15 @@ impl Metaspace { self.replacement = replacement; self.str_rep = replacement.to_string(); } + + pub fn get_prepend_scheme(&self) -> PrependScheme { + self.prepend_scheme.clone() + } + pub fn set_prepend_scheme(&mut self, scheme: PrependScheme) { self.prepend_scheme = scheme; } + } impl Default for Metaspace { From 550304a2c1b2d54b91f6ff952a7a901dccaaccb0 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 15:08:50 +0100 Subject: [PATCH 28/47] lint --- bindings/python/src/pre_tokenizers.rs | 7 ++++--- tokenizers/src/pre_tokenizers/metaspace.rs | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index e0e2c1da5..0e26142b6 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -497,7 +497,8 @@ impl PyMetaspace { PrependScheme::First => "First", PrependScheme::Never => "Never", PrependScheme::Always => "Always", - }.to_string() + } + .to_string() } #[setter] @@ -524,7 +525,7 @@ impl PyMetaspace { replacement: PyChar, add_prefix_space: bool, _kwargs: Option<&PyDict>, - ) -> PyResult<(Self, PyPreTokenizer)> { + ) -> PyResult<(Self, PyPreTokenizer)> { // Create a new Metaspace instance let mut new_instance: Metaspace = Metaspace::new(replacement.0, add_prefix_space); @@ -534,7 +535,7 @@ impl PyMetaspace { .and_then(|prepend_scheme| prepend_scheme.extract::().ok()) }) { // Unwrap the Option before calling to_lowercase - let prepend_scheme_value:String = prepend_scheme.to_lowercase(); + let prepend_scheme_value: String = prepend_scheme.to_lowercase(); let prepend_scheme_enum = match prepend_scheme_value.as_str() { "first" => PrependScheme::First, diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 262747683..b9e76159c 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -100,7 +100,6 @@ impl Metaspace { pub fn set_prepend_scheme(&mut self, scheme: PrependScheme) { self.prepend_scheme = scheme; } - } impl Default for Metaspace { From a1bf2f93608d8efe3afa517cae5b184c61e27930 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 15:11:04 +0100 Subject: [PATCH 29/47] update tests --- tokenizers/src/pre_tokenizers/metaspace.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index b9e76159c..8db4975eb 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -246,19 +246,19 @@ mod tests { #[test] fn non_legacy_meta_space() { let mut pretok = Metaspace::new('▁', true); - pretok.set_prepend_scheme("always".to_string()); + pretok.set_prepend_scheme(PrependScheme::Always); assert_eq!( pretok, Metaspace::new_with_prepend_scheme('▁', true, PrependScheme::Always) ); - pretok.set_prepend_scheme("never".to_string()); + pretok.set_prepend_scheme(PrependScheme::Never); assert_eq!( pretok, Metaspace::new_with_prepend_scheme('▁', true, PrependScheme::Never) ); - pretok.set_prepend_scheme("first".to_string()); + pretok.set_prepend_scheme(PrependScheme::First); assert_eq!( pretok, Metaspace::new_with_prepend_scheme('▁', true, PrependScheme::First) @@ -288,7 +288,7 @@ mod tests { ("▁you", (35, 41)) ] ); - pretok.set_prepend_scheme("always"); + pretok.set_prepend_scheme(PrependScheme::Always); pretok.pre_tokenize(&mut pretokenized).unwrap(); assert_eq!( pretokenized @@ -308,7 +308,7 @@ mod tests { ] ); - pretok.set_prepend_scheme("first"); + pretok.set_prepend_scheme(PrependScheme::First); let mut pretokenized = PreTokenizedString::from(" Hey how"); // test with prefix pretokenized .split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated)) From 24d72e036ddc2da7b22ca38e1474b9e85148257a Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 15:12:10 +0100 Subject: [PATCH 30/47] add test new == new_with_prepend_scheme --- tokenizers/src/pre_tokenizers/metaspace.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 8db4975eb..e0ad0957b 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -245,6 +245,11 @@ mod tests { #[test] fn non_legacy_meta_space() { + assert_eq!( + Metaspace::new('▁', true), + Metaspace::new_with_prepend_scheme('▁', true, PrependScheme::Always) + ); + let mut pretok = Metaspace::new('▁', true); pretok.set_prepend_scheme(PrependScheme::Always); assert_eq!( From ab0e4279b8aebebebd4feba63e1c588f2497a135 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 15:16:39 +0100 Subject: [PATCH 31/47] revert a change --- tokenizers/src/tokenizer/mod.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index d939fc6ce..2d7e10f73 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -658,10 +658,6 @@ where final_vocab } - /// Get the added tokens encoder - pub fn get_added_tokens_encoder(&self) -> HashMap { - self.added_vocabulary.get_vocab().clone() - } /// Get the added tokens decoder pub fn get_added_tokens_decoder(&self) -> HashMap { From eb2d0d88e0b7adc29d0ceae74574f39f26deec2d Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 15:20:04 +0100 Subject: [PATCH 32/47] use setters and getterts --- bindings/python/src/pre_tokenizers.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 0e26142b6..00b330807 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -494,9 +494,9 @@ impl PyMetaspace { // Assuming Metaspace has a method to get the prepend_scheme as a string let scheme: PrependScheme = getter!(self_, Metaspace, get_prepend_scheme()); match scheme { - PrependScheme::First => "First", - PrependScheme::Never => "Never", - PrependScheme::Always => "Always", + PrependScheme::First => "first", + PrependScheme::Never => "never", + PrependScheme::Always => "always", } .to_string() } From cc5519310e4568a2a307ac4a97a56cc827895df6 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 14 Nov 2023 15:39:49 +0100 Subject: [PATCH 33/47] Update bindings/python/src/pre_tokenizers.rs Co-authored-by: Nicolas Patry --- bindings/python/src/pre_tokenizers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 00b330807..fe29182dd 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -504,7 +504,7 @@ impl PyMetaspace { #[setter] fn set_prepend_scheme(self_: PyRef, prepend_scheme: &str) -> PyResult<()> { // Assuming Metaspace has a method to set the prepend_scheme from a string - let scheme = match prepend_scheme.to_lowercase().as_str() { + let scheme = match prepend_scheme.as_str() { "first" => PrependScheme::First, "never" => PrependScheme::Never, "always" => PrependScheme::Always, From e68bcc366b88d2bc5f9c9b5f33d94ef84e8775d1 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 15:44:37 +0100 Subject: [PATCH 34/47] nits --- bindings/python/src/pre_tokenizers.rs | 2 +- tokenizers/src/pre_tokenizers/metaspace.rs | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 00b330807..2c37698b7 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -510,7 +510,7 @@ impl PyMetaspace { "always" => PrependScheme::Always, _ => { return Err(exceptions::PyValueError::new_err(format!( - "Unknown variant: {}", + "Unknown variant: {}, should be one of First, Never, Always", prepend_scheme ))) } diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index e0ad0957b..ba1d8a1df 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -2,7 +2,7 @@ use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitD use serde::{Deserialize, Deserializer, Serialize}; /// Enum representing options for the metaspace prepending scheme. -#[derive(Debug, Clone, PartialEq, Serialize, Eq, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Eq, Deserialize, Copy)] #[serde(rename_all = "snake_case")] pub enum PrependScheme { /// Specifies that the scheme should be prepended only once, on the first split. @@ -63,12 +63,11 @@ impl<'de> Deserialize<'de> for Metaspace { impl Metaspace { pub fn new(replacement: char, add_prefix_space: bool) -> Self { - Self { + Self::new_with_prepend_scheme( replacement, - str_rep: replacement.to_string(), add_prefix_space, - prepend_scheme: PrependScheme::Always, // always prepend for legacy purpose - } + PrependScheme::Always, // always prepend for legacy purpose + ) } pub fn new_with_prepend_scheme( @@ -93,8 +92,8 @@ impl Metaspace { self.str_rep = replacement.to_string(); } - pub fn get_prepend_scheme(&self) -> PrependScheme { - self.prepend_scheme.clone() + pub fn get_prepend_scheme(&self) -> &PrependScheme { + &self.prepend_scheme } pub fn set_prepend_scheme(&mut self, scheme: PrependScheme) { From 20b0cf48cb05b19777f23c3e7e4d7d61c2b1e30c Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 15:55:06 +0100 Subject: [PATCH 35/47] use copy rather than ref --- tokenizers/src/pre_tokenizers/metaspace.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index ba1d8a1df..d7c8c3861 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -92,8 +92,8 @@ impl Metaspace { self.str_rep = replacement.to_string(); } - pub fn get_prepend_scheme(&self) -> &PrependScheme { - &self.prepend_scheme + pub fn get_prepend_scheme(&self) -> PrependScheme { + self.prepend_scheme } pub fn set_prepend_scheme(&mut self, scheme: PrependScheme) { From 6547fd883209f38ce6c88e9bc8e30d0b792b5e60 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 16:05:14 +0100 Subject: [PATCH 36/47] nits format --- bindings/python/src/pre_tokenizers.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index e9f0fb954..e5dad363c 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -500,19 +500,22 @@ impl PyMetaspace { } .to_string() } + const UNKNOWN_VARIANT_ERROR_MESSAGE: &str = + "is an unknown variant, should be one of ['First', 'Never', 'Always']"; #[setter] fn set_prepend_scheme(self_: PyRef, prepend_scheme: &str) -> PyResult<()> { // Assuming Metaspace has a method to set the prepend_scheme from a string - let scheme = match prepend_scheme.as_str() { - "first" => PrependScheme::First, - "never" => PrependScheme::Never, - "always" => PrependScheme::Always, + let scheme = match prepend_scheme { + "First" => PrependScheme::First, + "Never" => PrependScheme::Never, + "Always" => PrependScheme::Always, _ => { return Err(exceptions::PyValueError::new_err(format!( - "Unknown variant: {}, should be one of First, Never, Always", - prepend_scheme - ))) + "{} {}", + prepend_scheme, + Self::UNKNOWN_VARIANT_ERROR_MESSAGE, + ))); } }; setter!(self_, Metaspace, @set_prepend_scheme, scheme); @@ -543,9 +546,10 @@ impl PyMetaspace { "always" => PrependScheme::Always, _ => { return Err(exceptions::PyValueError::new_err(format!( - "Unknown variant: {}", - prepend_scheme_value - ))) + "{} {}", + prepend_scheme, + Self::UNKNOWN_VARIANT_ERROR_MESSAGE, + ))); } }; From 3ad148d38107eda4112419d81000e524fc0a4c96 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 16:06:36 +0100 Subject: [PATCH 37/47] more nits --- bindings/python/tests/bindings/test_pre_tokenizers.py | 4 ++-- tokenizers/src/pre_tokenizers/mod.rs | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bindings/python/tests/bindings/test_pre_tokenizers.py b/bindings/python/tests/bindings/test_pre_tokenizers.py index daf827b26..8008df4c8 100644 --- a/bindings/python/tests/bindings/test_pre_tokenizers.py +++ b/bindings/python/tests/bindings/test_pre_tokenizers.py @@ -110,8 +110,8 @@ def test_can_modify(self): assert pretok.replacement == "%" pretok.add_prefix_space = True assert pretok.add_prefix_space == True - pretok.prepend_scheme = "never" - assert pretok.prepend_scheme == "never" + pretok.prepend_scheme = "Never" + assert pretok.prepend_scheme == "Never" class TestCharDelimiterSplit: diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index fc8b9a0cf..42bbd15aa 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -126,7 +126,11 @@ mod tests { assert_eq!( pre_tokenizer, - PreTokenizerWrapper::Metaspace(Metaspace::new('▁', true)) + PreTokenizerWrapper::Metaspace(Metaspace::new_with_prepend_scheme( + '▁', + true, + metaspace::PrependScheme::Always + )) ); } From 4fb2cb0b2b27cadebef129da1b52111758b929de Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 16:11:06 +0100 Subject: [PATCH 38/47] allow option string --- bindings/python/src/pre_tokenizers.rs | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index e5dad363c..53dc30788 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -523,24 +523,19 @@ impl PyMetaspace { } #[new] - #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")] + #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, prepend_scheme=None, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")] fn new( replacement: PyChar, add_prefix_space: bool, + prepend_scheme: Option, _kwargs: Option<&PyDict>, ) -> PyResult<(Self, PyPreTokenizer)> { // Create a new Metaspace instance let mut new_instance: Metaspace = Metaspace::new(replacement.0, add_prefix_space); - if let Some(prepend_scheme) = _kwargs.and_then(|kwargs| { - kwargs - .get_item("prepend_scheme") - .and_then(|prepend_scheme| prepend_scheme.extract::().ok()) - }) { - // Unwrap the Option before calling to_lowercase - let prepend_scheme_value: String = prepend_scheme.to_lowercase(); - - let prepend_scheme_enum = match prepend_scheme_value.as_str() { + // If a prepend scheme is provided, set it + if let Some(prepend_scheme) = prepend_scheme { + let prepend_scheme_enum = match prepend_scheme.as_str() { "first" => PrependScheme::First, "never" => PrependScheme::Never, "always" => PrependScheme::Always, @@ -552,11 +547,8 @@ impl PyMetaspace { ))); } }; - - // Set the prepend_scheme in the new instance new_instance.set_prepend_scheme(prepend_scheme_enum); } - Ok((PyMetaspace {}, new_instance.into())) } } From d5491a022c98a6461dd12b397ed9a49392f509d2 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 16:18:47 +0100 Subject: [PATCH 39/47] enforce First Never Always camel cased --- bindings/python/src/pre_tokenizers.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 53dc30788..ead265c9d 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -494,9 +494,9 @@ impl PyMetaspace { // Assuming Metaspace has a method to get the prepend_scheme as a string let scheme: PrependScheme = getter!(self_, Metaspace, get_prepend_scheme()); match scheme { - PrependScheme::First => "first", - PrependScheme::Never => "never", - PrependScheme::Always => "always", + PrependScheme::First => "First", + PrependScheme::Never => "Never", + PrependScheme::Always => "Always", } .to_string() } @@ -536,9 +536,9 @@ impl PyMetaspace { // If a prepend scheme is provided, set it if let Some(prepend_scheme) = prepend_scheme { let prepend_scheme_enum = match prepend_scheme.as_str() { - "first" => PrependScheme::First, - "never" => PrependScheme::Never, - "always" => PrependScheme::Always, + "First" => PrependScheme::First, + "Never" => PrependScheme::Never, + "Always" => PrependScheme::Always, _ => { return Err(exceptions::PyValueError::new_err(format!( "{} {}", From e4d1fd6e80e7496c706dd0b1027d1336a9fd16ef Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 16:37:35 +0100 Subject: [PATCH 40/47] nits --- bindings/python/src/pre_tokenizers.rs | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index ead265c9d..3758bb423 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -494,30 +494,30 @@ impl PyMetaspace { // Assuming Metaspace has a method to get the prepend_scheme as a string let scheme: PrependScheme = getter!(self_, Metaspace, get_prepend_scheme()); match scheme { - PrependScheme::First => "First", - PrependScheme::Never => "Never", - PrependScheme::Always => "Always", + PrependScheme::First => "first", + PrependScheme::Never => "never", + PrependScheme::Always => "always", } .to_string() } - const UNKNOWN_VARIANT_ERROR_MESSAGE: &str = - "is an unknown variant, should be one of ['First', 'Never', 'Always']"; - #[setter] - fn set_prepend_scheme(self_: PyRef, prepend_scheme: &str) -> PyResult<()> { - // Assuming Metaspace has a method to set the prepend_scheme from a string + from_string(string: String) -> Result{ let scheme = match prepend_scheme { - "First" => PrependScheme::First, - "Never" => PrependScheme::Never, - "Always" => PrependScheme::Always, + "first" => PrependScheme::First, + "never" => PrependScheme::Never, + "always" => PrependScheme::Always, _ => { return Err(exceptions::PyValueError::new_err(format!( - "{} {}", - prepend_scheme, - Self::UNKNOWN_VARIANT_ERROR_MESSAGE, + "{} is an unknown variant, should be one of ['first', 'never', 'always']", ))); } }; + } + + #[setter] + fn set_prepend_scheme(self_: PyRef, prepend_scheme: &str) -> PyResult<()> { + // Assuming Metaspace has a method to set the prepend_scheme from a string + setter!(self_, Metaspace, @set_prepend_scheme, scheme); Ok(()) } From 0ed8d4bb7f7e399c8dc3e532e825fc78602aabaf Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 16:53:10 +0100 Subject: [PATCH 41/47] refactor --- bindings/python/src/pre_tokenizers.rs | 53 +++++++++++++-------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 3758bb423..5162b9d91 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -452,6 +452,21 @@ impl PySequence { } } +fn _from_string(string: String) -> Result{ + let scheme = match string.as_str() { + "first" => PrependScheme::First, + "never" => PrependScheme::Never, + "always" => PrependScheme::Always, + _ => { + return Err(exceptions::PyValueError::new_err(format!( + "{} is an unknown variant, should be one of ['first', 'never', 'always']", + string + ))); + } + }; + Ok(scheme) +} + /// Metaspace pre-tokenizer /// /// This pre-tokenizer replaces any whitespace by the provided replacement character. @@ -501,23 +516,9 @@ impl PyMetaspace { .to_string() } - from_string(string: String) -> Result{ - let scheme = match prepend_scheme { - "first" => PrependScheme::First, - "never" => PrependScheme::Never, - "always" => PrependScheme::Always, - _ => { - return Err(exceptions::PyValueError::new_err(format!( - "{} is an unknown variant, should be one of ['first', 'never', 'always']", - ))); - } - }; - } - #[setter] - fn set_prepend_scheme(self_: PyRef, prepend_scheme: &str) -> PyResult<()> { - // Assuming Metaspace has a method to set the prepend_scheme from a string - + fn set_prepend_scheme(self_: PyRef, prepend_scheme: String) -> PyResult<()> { + let scheme = _from_string(prepend_scheme)?; setter!(self_, Metaspace, @set_prepend_scheme, scheme); Ok(()) } @@ -535,19 +536,15 @@ impl PyMetaspace { // If a prepend scheme is provided, set it if let Some(prepend_scheme) = prepend_scheme { - let prepend_scheme_enum = match prepend_scheme.as_str() { - "First" => PrependScheme::First, - "Never" => PrependScheme::Never, - "Always" => PrependScheme::Always, - _ => { - return Err(exceptions::PyValueError::new_err(format!( - "{} {}", - prepend_scheme, - Self::UNKNOWN_VARIANT_ERROR_MESSAGE, - ))); + match _from_string(prepend_scheme) { + Ok(prepend_scheme_enum) => { + new_instance.set_prepend_scheme(prepend_scheme_enum); } - }; - new_instance.set_prepend_scheme(prepend_scheme_enum); + Err(err) => { + // Handle the error, you can return it or exit the function + return Err(err.into()); + } + } } Ok((PyMetaspace {}, new_instance.into())) } From cc4e4e647d947038b947103755b2bd359040bb13 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 16:54:30 +0100 Subject: [PATCH 42/47] update test as well --- bindings/python/tests/bindings/test_pre_tokenizers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/python/tests/bindings/test_pre_tokenizers.py b/bindings/python/tests/bindings/test_pre_tokenizers.py index 8008df4c8..daf827b26 100644 --- a/bindings/python/tests/bindings/test_pre_tokenizers.py +++ b/bindings/python/tests/bindings/test_pre_tokenizers.py @@ -110,8 +110,8 @@ def test_can_modify(self): assert pretok.replacement == "%" pretok.add_prefix_space = True assert pretok.add_prefix_space == True - pretok.prepend_scheme = "Never" - assert pretok.prepend_scheme == "Never" + pretok.prepend_scheme = "never" + assert pretok.prepend_scheme == "never" class TestCharDelimiterSplit: From 2f67039ccda43f01498b24b7da3e01f26373c2d6 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 16:59:46 +0100 Subject: [PATCH 43/47] fmt --- bindings/python/src/pre_tokenizers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 5162b9d91..84414f652 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -452,7 +452,7 @@ impl PySequence { } } -fn _from_string(string: String) -> Result{ +fn _from_string(string: String) -> Result { let scheme = match string.as_str() { "first" => PrependScheme::First, "never" => PrependScheme::Never, From 88d862b2a19b319b5d46260c2f7bf7cdde0199c0 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 17:04:32 +0100 Subject: [PATCH 44/47] nits --- bindings/python/src/pre_tokenizers.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 84414f652..0fd9e9d40 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -536,14 +536,10 @@ impl PyMetaspace { // If a prepend scheme is provided, set it if let Some(prepend_scheme) = prepend_scheme { - match _from_string(prepend_scheme) { - Ok(prepend_scheme_enum) => { - new_instance.set_prepend_scheme(prepend_scheme_enum); - } - Err(err) => { - // Handle the error, you can return it or exit the function - return Err(err.into()); - } + if let Ok(prepend_scheme_enum) = _from_string(prepend_scheme) { + new_instance.set_prepend_scheme(prepend_scheme_enum); + } else { + return Err(err); } } Ok((PyMetaspace {}, new_instance.into())) From c6469782fb507368ff898da63b68485ad0ea4f0d Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 17:07:34 +0100 Subject: [PATCH 45/47] properly error out --- bindings/python/src/pre_tokenizers.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 0fd9e9d40..56219e770 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -536,10 +536,9 @@ impl PyMetaspace { // If a prepend scheme is provided, set it if let Some(prepend_scheme) = prepend_scheme { - if let Ok(prepend_scheme_enum) = _from_string(prepend_scheme) { - new_instance.set_prepend_scheme(prepend_scheme_enum); - } else { - return Err(err); + match _from_string(prepend_scheme) { + Ok(prepend_scheme_enum) => new_instance.set_prepend_scheme(prepend_scheme_enum), + Err(err) => return Err(err), } } Ok((PyMetaspace {}, new_instance.into())) From ac646fa80f6e4d9dbde042d75dd4d16dddd7921f Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 14 Nov 2023 17:52:05 +0100 Subject: [PATCH 46/47] Update bindings/python/src/pre_tokenizers.rs Co-authored-by: Nicolas Patry --- bindings/python/src/pre_tokenizers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 56219e770..6c819ca3a 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -452,7 +452,7 @@ impl PySequence { } } -fn _from_string(string: String) -> Result { +fn from_string(string: String) -> Result { let scheme = match string.as_str() { "first" => PrependScheme::First, "never" => PrependScheme::Never, From e41955b9cae7e9582c1d4c36565457031b8544bb Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 14 Nov 2023 17:53:54 +0100 Subject: [PATCH 47/47] suggestion changes --- bindings/python/src/pre_tokenizers.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 6c819ca3a..0f64357a3 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -518,7 +518,7 @@ impl PyMetaspace { #[setter] fn set_prepend_scheme(self_: PyRef, prepend_scheme: String) -> PyResult<()> { - let scheme = _from_string(prepend_scheme)?; + let scheme = from_string(prepend_scheme)?; setter!(self_, Metaspace, @set_prepend_scheme, scheme); Ok(()) } @@ -536,7 +536,7 @@ impl PyMetaspace { // If a prepend scheme is provided, set it if let Some(prepend_scheme) = prepend_scheme { - match _from_string(prepend_scheme) { + match from_string(prepend_scheme) { Ok(prepend_scheme_enum) => new_instance.set_prepend_scheme(prepend_scheme_enum), Err(err) => return Err(err), }