
Commit

initial commit
ArthurZucker committed Aug 8, 2024
1 parent 38003ad commit b6d01b7
Showing 2 changed files with 20 additions and 0 deletions.
1 change: 1 addition & 0 deletions tokenizers/src/tokenizer/added_vocabulary.rs
@@ -453,6 +453,7 @@ impl AddedVocabulary {
        splits
    }


    fn fast_split_with_indices(
        &self,
        sentence: NormalizedString,
19 changes: 19 additions & 0 deletions tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -186,6 +186,25 @@ impl PreTokenizedString {
        }
    }

    pub fn fast_into_encoding(self) -> Result<Encoding> {
        if self.splits.is_empty() {
            Ok(Encoding::default())
        } else if !self.splits.iter().all(|split| split.tokens.is_some()) {
            Err("Split has not been tokenized.".into())
        } else {
            let tokens = self
                .splits
                .into_iter()
                .flat_map(|split| {
                    split.tokens.unwrap().into_iter().map(|token| {
                        // Keep only the token id and fill the remaining
                        // `Encoding` fields (value, offsets, word index,
                        // type id) with cheap placeholders, skipping the
                        // offset conversion that `into_encoding` performs.
                        (token.id, String::new(), (0, 0), None, 0)
                    })
                })
                .collect();
            Ok(tokens)
        }
    }

    /// Returns a list of splits, each of them being a slice of the normalized
    /// string, the associated offsets either in original or normalized
    /// referential, as well as the potential tokens
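For orientation, a minimal usage sketch of the new fast path follows. It is an illustration, not part of the commit: it assumes `PreTokenizedString`, `Token`, and `Result` are re-exported at the crate root as in the rest of the library, and the stand-in tokenizer closure (one token per split, id 0) is hypothetical.

use tokenizers::{PreTokenizedString, Result, Token};

fn main() -> Result<()> {
    // Build a PreTokenizedString from raw text; it starts as a single split.
    let mut pretok = PreTokenizedString::from("hello world");

    // Tokenize each split with a stand-in tokenizer: one token per split,
    // id 0, byte offsets spanning the whole split.
    pretok.tokenize(|normalized| {
        Ok(vec![Token::new(
            0,
            normalized.get().to_owned(),
            (0, normalized.get().len()),
        )])
    })?;

    // The fast path keeps only token ids; values, offsets, word indices,
    // and type ids come out as placeholders.
    let encoding = pretok.fast_into_encoding()?;
    println!("{:?}", encoding.get_ids());
    Ok(())
}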
