From 6c713d2db7261297aabcb34688d055ef8e5ade1e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 11 Jul 2024 17:57:57 +0200 Subject: [PATCH] splitting still an issue --- tokenizers/src/pre_tokenizers/metaspace.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs index 0237949b5..1c06b38cf 100644 --- a/tokenizers/src/pre_tokenizers/metaspace.rs +++ b/tokenizers/src/pre_tokenizers/metaspace.rs @@ -128,6 +128,24 @@ impl PreTokenizer for Metaspace { NormalizedString::from(self.replacement.to_string()).into(), ); } + pretokenized.split(|_, mut normalized| { + normalized.replace(' ', &self.str_rep)?; + match self.prepend_scheme { + PrependScheme::Always => { + if !normalized.get().starts_with(self.replacement) { + normalized.prepend(&self.str_rep); + } + } + PrependScheme::Never => {} + _ => {} // Handle other cases if needed + }; + if self.split { + normalized.split(self.replacement, SplitDelimiterBehavior::MergedWithNext) + } else { + Ok(vec![normalized]) + } + }); + Ok(()) } else { pretokenized.split(|_, mut normalized| {