fix everything

huggingface · Jul 12, 2024 · 7ff8935
1 parent 8d4fae8
commit 7ff8935
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 15 deletions.
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
@@ -1217,7 +1217,6 @@ impl PyTokenizer {
             processed_old_tokens.push(old_token);
             processed_new_tokens.push(new_token);
         }
-
         Ok(self
             .tokenizer
             .assign_tokens(&processed_old_tokens, &processed_new_tokens))

diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -322,7 +322,7 @@ impl AddedVocabulary {
                     .lock()
                     .unwrap()
                     .entry(id)
-                    .and_modify(|t| *t = new.clone());
+                    .and_modify(|t| t.content = new.content.clone());
                 self.refresh_added_tokens(model, normalizer);
             } else {
                 error!("Error: you tried to re-assign a token that does not exist in the added vocab. Make sure {:?} is first added to the vocab", old.content.clone())
@@ -336,17 +336,12 @@ impl AddedVocabulary {
     /// non-normalized string, and one matching against the normalized one.
     fn refresh_added_tokens<N: Normalizer>(&mut self, model: &impl Model, normalizer: Option<&N>) {
         type TupleTokenId<'a> = (&'a AddedToken, u32);
-        let (normalized, non_normalized): (Vec<TupleTokenId>, Vec<TupleTokenId>) = self
-            .added_tokens
-            .iter()
-            .map(|token| {
-                (
-                    token,
-                    self.token_to_id(&token.content, model)
-                        .expect("Missing additional token"),
-                )
-            })
-            .partition(|(token, _)| token.normalized);
+        let added_tokens_map_r = self.added_tokens_map_r.lock().unwrap().clone();
+        let (normalized, non_normalized): (Vec<TupleTokenId>, Vec<TupleTokenId>) =
+            added_tokens_map_r
+                .iter()
+                .map(|(id, token)| (token, *id))
+                .partition(|(token, _)| token.normalized);
 
         let (tokens, ids): (Vec<&AddedToken>, Vec<u32>) = non_normalized.into_iter().unzip();
         let trie = AhoCorasickBuilder::new()
@@ -363,6 +358,7 @@ impl AddedVocabulary {
                 if let Some(n) = normalizer {
                     n.normalize(&mut content).unwrap();
                 }
+                println!("{:?}", token);
                 content
             })
             .collect();

diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
@@ -541,9 +541,7 @@ where
             model,
             post_processor: None,
             decoder: None,
-
             added_vocabulary: AddedVocabulary::new(),
-
             truncation: None,
             padding: None,
         }