Skip to content

Commit

Permalink
Revert "[BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder (#1513)"
Browse files Browse the repository at this point in the history

This reverts commit 25aee8b.
  • Loading branch information
ArthurZucker committed Jul 12, 2024
1 parent fdd26ba commit 3eed134
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 51 deletions.
6 changes: 0 additions & 6 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,6 @@ jobs:
toolchain: stable
components: rustfmt, clippy

- name: Install audit
uses: actions-rs/cargo@v1
with:
command: install
args: cargo-audit

- name: Install Python
uses: actions/setup-python@v4
with:
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,6 @@ jobs:
command: install
args: cargo-readme

- name: Install audit
uses: actions-rs/cargo@v1
with:
command: install
args: cargo-audit

- name: Build
uses: actions-rs/cargo@v1
with:
Expand Down
8 changes: 0 additions & 8 deletions tokenizers/src/tokenizer/added_vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,21 +216,13 @@ impl AddedVocabulary {
}

/// Get the token matching the given id if it exists
#[deprecated(
since = "0.19.0",
note = "please use `added_vocabulary.simple_id_to_token(id).or_else(|| model.id_to_token(id)` instead"
)]
pub fn id_to_token(&self, id: u32, model: &impl Model) -> Option<String> {
self.added_tokens_map_r
.get(&id)
.map(|t| t.content.clone())
.or_else(|| model.id_to_token(id))
}

pub fn simple_id_to_token(&self, id: u32) -> Option<String> {
self.added_tokens_map_r.get(&id).map(|t| t.content.clone())
}

//
pub fn set_encode_special_tokens(&mut self, value: bool) {
self.encode_special_tokens = value;
Expand Down
47 changes: 16 additions & 31 deletions tokenizers/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -699,9 +699,7 @@ where

/// Converts an id to the corresponding token.
pub fn id_to_token(&self, id: u32) -> Option<String> {
self.added_vocabulary
.simple_id_to_token(id)
.or_else(|| self.model.id_to_token(id))
self.added_vocabulary.id_to_token(id, &self.model)
}

/// set the added vocab's splitting scheme
Expand Down Expand Up @@ -847,35 +845,22 @@ where

/// Decode the given ids, back to a String
pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> {
let mut result = String::with_capacity(ids.len());
let mut chunks = Vec::with_capacity(ids.len());
for id in ids {
if let Some(added_token) = self.added_vocabulary.simple_id_to_token(*id) {
if skip_special_tokens && self.added_vocabulary.is_special_token(&added_token) {
continue;
}
let text_chunk = if let Some(decoder) = &self.decoder {
decoder.decode(chunks.clone())?
} else {
chunks.join(" ")
};
result.push_str(&text_chunk);
if !result.is_empty() && self.decoder.is_none() {
result.push(' ');
}
result.push_str(&added_token);
chunks.clear();
} else if let Some(token) = self.model.id_to_token(*id) {
chunks.push(token);
}
}
let text_chunk = if let Some(decoder) = &self.decoder {
decoder.decode(chunks.clone())?
let tokens = ids
.iter()
.filter_map(|id| {
self.added_vocabulary
.id_to_token(*id, &self.model)
.filter(|token| {
!skip_special_tokens || !self.added_vocabulary.is_special_token(token)
})
})
.collect::<Vec<_>>();

if let Some(decoder) = &self.decoder {
decoder.decode(tokens)
} else {
chunks.join(" ")
};
result.push_str(&text_chunk);
Ok(result)
Ok(tokens.join(" "))
}
}
}

Expand Down

0 comments on commit 3eed134

Please sign in to comment.