From 38003ad370fe26c71febad767135b164c7cb57c0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 20 Jun 2024 16:19:42 +0200
Subject: [PATCH 1/2] sounds fun

what I hope for

nit

Is this what's expected?
---
 tokenizers/src/pre_tokenizers/byte_level.rs  | 29 ++++++++-
 tokenizers/src/tokenizer/added_vocabulary.rs | 65 ++++++++++++++++++--
 tokenizers/src/tokenizer/mod.rs              |  2 +-
 3 files changed, 89 insertions(+), 7 deletions(-)

diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 2d3845b55..13d3fbef4 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -41,6 +41,14 @@ lazy_static! {
         r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
     )
     .unwrap();
+    static ref RE_VEC: Vec<SysRegex> = {
+        let pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
+        let mut vec = Vec::with_capacity(MAX_NUM_THREADS);
+        for _ in 0..MAX_NUM_THREADS {
+            vec.push(SysRegex::new(pattern).unwrap());
+        }
+        vec
+    };
     static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
     static ref CHAR_BYTES: HashMap<char, u8> =
         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
@@ -111,12 +119,31 @@ impl ByteLevel {
     }
 }
 
+use std::num::NonZeroU64;
+use std::thread;
+
+pub struct FakeThreadId(NonZeroU64);
+
+fn hash_current_thread() -> usize {
+    // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter
+    // that works great for our use case of avoiding collisions in our array. Unfortunately,
+    // it's private. However, there are only so many ways you can layout a u64, so just transmute
+    // https://github.com/rust-lang/rust/issues/67939
+    const _: [u8; 8] = [0; std::mem::size_of::<thread::ThreadId>()];
+    const _: [u8; 8] = [0; std::mem::size_of::<FakeThreadId>()];
+    let x =
+        unsafe { std::mem::transmute::<thread::ThreadId, FakeThreadId>(thread::current().id()).0 };
+    u64::from(x) as usize - 1
+}
+
+const MAX_NUM_THREADS: usize = 128;
+
 /// As a `PreTokenizer`, `ByteLevel` is in charge of transforming all the unicode characters into
 /// their byte-level counterpart. It also splits the input according to the configured regex.
 // TODO: Give the ability to modify this regex
 impl PreTokenizer for ByteLevel {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        let re_ref: &SysRegex = &RE;
+        let re_ref: &SysRegex = &RE_VEC[hash_current_thread() % MAX_NUM_THREADS]; // TODO use the thread thing here as well!
         pretokenized.split(|_, mut normalized| {
             if self.add_prefix_space && !normalized.get().starts_with(' ') {
                 normalized.prepend(" ");
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index a0c2f4542..e33ab86b0 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -92,6 +92,25 @@ impl std::hash::Hash for AddedToken {
     }
 }
 
+use std::num::NonZeroU64;
+use std::thread;
+
+pub struct FakeThreadId(NonZeroU64);
+
+fn hash_current_thread() -> usize {
+    // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter
+    // that works great for our use case of avoiding collisions in our array. Unfortunately,
+    // it's private. However, there are only so many ways you can layout a u64, so just transmute
+    // https://github.com/rust-lang/rust/issues/67939
+    const _: [u8; 8] = [0; std::mem::size_of::<thread::ThreadId>()];
+    const _: [u8; 8] = [0; std::mem::size_of::<FakeThreadId>()];
+    let x =
+        unsafe { std::mem::transmute::<thread::ThreadId, FakeThreadId>(thread::current().id()).0 };
+    u64::from(x) as usize
+}
+
+const MAX_NUM_THREADS: usize = 128;
+
 type MatchingSet = (AhoCorasick, Vec<u32>);
 
 lazy_static! {
@@ -156,11 +175,16 @@ pub struct AddedVocabulary {
     /// us remove them easily with an O(1) complexity.
     special_tokens_set: HashSet<String>,
 
-    /// A RegexSet containing all the non-normalized patterns used to split on AddedTokens
+    //// A RegexSet containing all the non-normalized patterns used to split on AddedTokens
     split_trie: MatchingSet,
     /// A RegexSet containing all the normalized patterns used to split on AddedTokens
     split_normalized_trie: MatchingSet,
 
+    // A RegexSet containing all the non-normalized patterns used to split on AddedTokens
+    split_trie_vec: Vec<MatchingSet>,
+    /// A RegexSet containing all the normalized patterns used to split on AddedTokens
+    split_normalized_trie_vec: Vec<MatchingSet>,
+
     /// Whether or not special tokens should be splitted when encoding. This is equivalent to ignoring them
     encode_special_tokens: bool,
 }
@@ -181,8 +205,10 @@ impl AddedVocabulary {
             added_tokens: vec![],
             special_tokens: vec![],
             special_tokens_set: HashSet::new(),
-            split_trie: (trie, vec![]),
-            split_normalized_trie: (normalized_trie, vec![]),
+            split_trie: (trie.clone(), vec![]),
+            split_normalized_trie: (normalized_trie.clone(), vec![]),
+            split_trie_vec: vec![(trie, vec![]); MAX_NUM_THREADS],
+            split_normalized_trie_vec: vec![(normalized_trie, vec![]); MAX_NUM_THREADS],
             encode_special_tokens: false,
         }
     }
@@ -345,6 +371,7 @@ impl AddedVocabulary {
             .build(tokens.iter().map(|token| &token.content))
             .expect("Failed to build tried when refreshing tokens");
         self.split_trie = (trie, ids);
+        self.split_trie_vec = vec![self.split_trie.clone(); MAX_NUM_THREADS];
 
         let (ntokens, nids): (Vec<&AddedToken>, Vec<u32>) = normalized.into_iter().unzip();
         let patterns: Vec<_> = ntokens
@@ -362,6 +389,7 @@ impl AddedVocabulary {
             .build(patterns.iter().map(|content| content.get()))
             .expect("Failed to build tried when refreshing tokens (normalized)");
         self.split_normalized_trie = (normalized_trie, nids);
+        self.split_normalized_trie_vec = vec![self.split_normalized_trie.clone(); MAX_NUM_THREADS];
     }
 
     /// Find any AddedToken in the given sentence, using the provided MatchingSet.
@@ -425,6 +453,25 @@ impl AddedVocabulary {
         splits
     }
 
+    fn fast_split_with_indices(
+        &self,
+        sentence: NormalizedString,
+        split_re: &MatchingSet,
+    ) -> Vec<(NormalizedString, Option<Vec<Token>>)> {
+        self.find_matches(sentence.get(), split_re)
+            .into_iter()
+            .map(|(id, byte_offsets)| {
+                let slice = sentence
+                    .slice(Range::Normalized(byte_offsets.0..byte_offsets.1))
+                    .expect("AddedVocabulary bad split");
+                if let Some(id) = id {
+                    (slice, Some(vec![Token::new(id, String::new(), (0, 0))]))
+                } else {
+                    (slice, None)
+                }
+            })
+            .collect()
+    }
     /// Split the input sentence to extract anything we found from the `MatchingSet`, as well as
     /// the list of corresponding IDs
     /// The list of IDs have the exact same number of elements than the Iterator.
@@ -465,7 +512,12 @@ impl AddedVocabulary {
 
         // 1. We extract all the non-normalized tokens from the non-normalized string
         pretokenized
-            .split(|_, sequence| Ok(self.split_with_indices(sequence, &self.split_trie)))
+            .split(|_, sequence| {
+                Ok(self.fast_split_with_indices(
+                    sequence,
+                    &self.split_trie_vec[hash_current_thread() % MAX_NUM_THREADS],
+                ))
+            })
             .expect("AddedVocabulary bad split");
 
         // normalized = False
@@ -484,7 +536,10 @@ impl AddedVocabulary {
         pretokenized
             .split(|_, mut sequence| {
                 normalizer.map(|n| n.normalize(&mut sequence));
-                Ok(self.split_with_indices(sequence, &self.split_normalized_trie))
+                Ok(self.fast_split_with_indices(
+                    sequence,
+                    &self.split_normalized_trie_vec[hash_current_thread() % MAX_NUM_THREADS],
+                ))
             })
             .expect("AddedVocabulary bad split");
 
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 1c2ad6e0b..4bde4d39d 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -881,7 +881,7 @@ where
     ) -> Result<Encoding> {
         let mut pretokenized: PreTokenizedString = pretokenized.into();
         pretokenized.tokenize(|normalized| self.model.tokenize(normalized.get()))?;
-        pretokenized.into_encoding(word_idx, type_id, offsets_type)
+        pretokenized.fast_into_encoding()
     }
 }
 

From b6d01b788a9561c414d8efdd7e174da8b98fc5d6 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 20 Jun 2024 16:01:51 +0200
Subject: [PATCH 2/2] initial commit

---
 tokenizers/src/tokenizer/added_vocabulary.rs |  1 +
 tokenizers/src/tokenizer/pre_tokenizer.rs    | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index e33ab86b0..7db205189 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -453,6 +453,7 @@ impl AddedVocabulary {
         splits
     }
 
+
     fn fast_split_with_indices(
         &self,
         sentence: NormalizedString,
diff --git a/tokenizers/src/tokenizer/pre_tokenizer.rs b/tokenizers/src/tokenizer/pre_tokenizer.rs
index 9667c240a..be919fd62 100644
--- a/tokenizers/src/tokenizer/pre_tokenizer.rs
+++ b/tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -186,6 +186,25 @@ impl PreTokenizedString {
         }
     }
 
+    pub fn fast_into_encoding(self) -> Result<Encoding> {
+        if self.splits.is_empty() {
+            Ok(Encoding::default())
+        } else if !self.splits.iter().all(|split| split.tokens.is_some()) {
+            Err("Split has not been tokenized.".into())
+        } else {
+            let tokens = self
+                .splits
+                .into_iter()
+                .flat_map(|split| {
+                    split.tokens.unwrap().into_iter().map(|token| {
+                        // Replace this with the actual fields you need for the Encoding type
+                        (token.id, String::new(), (0, 0), None, 0)
+                    })
+                })
+                .collect();
+            Ok(tokens)
+        }
+    }
     /// Returns a list of splits, each of them being a slice of the normalized
     /// string, the associated offsets either in original or normalized
     /// referential, as well as the potential tokens
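
Note: both patches above lean on the same idea: hash the current thread id into a fixed-size array of precompiled, read-only values (regexes, tries), so concurrent callers stop contending on a single shared instance. What follows is a minimal, safe, standalone sketch of that slot-selection pattern only, not the patch's code: it uses `DefaultHasher` and `OnceLock` instead of the `transmute` trick, and a plain `String` stands in for `SysRegex` / `MatchingSet`.

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::sync::OnceLock;
use std::thread;

const MAX_NUM_THREADS: usize = 128;

// Map the current ThreadId onto one of MAX_NUM_THREADS slots. Collisions are
// possible but harmless: two threads landing on the same slot simply share
// one read-only, precompiled value.
fn current_thread_slot() -> usize {
    let mut hasher = DefaultHasher::new();
    thread::current().id().hash(&mut hasher);
    (hasher.finish() as usize) % MAX_NUM_THREADS
}

// One precompiled "engine" per slot, built once and only read afterwards.
// `String` is a stand-in; the patches store SysRegex / MatchingSet here.
static ENGINES: OnceLock<Vec<String>> = OnceLock::new();

fn engine_for_current_thread() -> &'static str {
    let engines = ENGINES.get_or_init(|| {
        (0..MAX_NUM_THREADS).map(|i| format!("engine-{i}")).collect()
    });
    &engines[current_thread_slot()]
}

fn main() {
    // Each spawned thread picks its own slot without taking any lock.
    let handles: Vec<_> = (0..4)
        .map(|_| thread::spawn(|| println!("{}", engine_for_current_thread())))
        .collect();
    for handle in handles {
        handle.join().unwrap();
    }
}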