Merge branch 'main' of github.com:huggingface/tokenizers into branch_v0.15.1.rc1
ArthurZucker committed Jan 19, 2024
2 parents 7fafaf7 + 6a77d48 commit f387e2a
Showing 13 changed files with 197 additions and 22 deletions.
6 changes: 2 additions & 4 deletions README.md
@@ -70,7 +70,5 @@ print(output.tokens)
# ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
```

Check the [python documentation](https://huggingface.co/docs/tokenizers/index) or the

[python quicktour](https://huggingface.co/docs/tokenizers/python/latest/quicktour.html) to learn
more!
Check the [documentation](https://huggingface.co/docs/tokenizers/index)
or the [quicktour](https://huggingface.co/docs/tokenizers/quicktour) to learn more!
6 changes: 3 additions & 3 deletions bindings/python/Cargo.toml
@@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
serde_json = "1.0"
libc = "0.2"
env_logger = "0.10.0"
pyo3 = { version = "0.19" }
numpy = "0.19.0"
pyo3 = { version = "0.20" }
numpy = "0.20.0"
ndarray = "0.15"
onig = { version = "6.4", default-features = false }
itertools = "0.11"
@@ -26,7 +26,7 @@ path = "../../tokenizers"

[dev-dependencies]
tempfile = "3.8"
pyo3 = { version = "0.19", features = ["auto-initialize"] }
pyo3 = { version = "0.20", features = ["auto-initialize"] }

[features]
defaut = ["pyo3/extension-module"]
12 changes: 12 additions & 0 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -836,6 +836,18 @@ class Tokenizer:
Returns:
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
"""
pass
@property
def encode_special_tokens(self):
"""
Modifies the tokenizer in order to use or not the special tokens
during encoding.
Args:
value (:obj:`bool`):
Whether to use the special tokens or not
"""
pass
@staticmethod
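For reference, a minimal usage sketch of the new attribute (not part of the diff; it mirrors the Python test added further down):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_special_tokens(["<end_of_text>"])

# By default, special tokens are matched as whole units during encoding.
print(tokenizer.encode_special_tokens)  # False

# Enabling the flag makes the tokenizer split special tokens like regular text.
tokenizer.encode_special_tokens = True
output = tokenizer.encode("Hey there<end_of_text>", add_special_tokens=False)
print(output.tokens)  # "<end_of_text>" is now broken into sub-tokens
```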
6 changes: 3 additions & 3 deletions bindings/python/src/processors.rs
@@ -266,15 +266,15 @@ impl FromPyObject<'_> for PySpecialToken {
Ok(Self(v.into()))
} else if let Ok(d) = ob.downcast::<PyDict>() {
let id = d
.get_item("id")
.get_item("id")?
.ok_or_else(|| exceptions::PyValueError::new_err("`id` must be specified"))?
.extract::<String>()?;
let ids = d
.get_item("ids")
.get_item("ids")?
.ok_or_else(|| exceptions::PyValueError::new_err("`ids` must be specified"))?
.extract::<Vec<u32>>()?;
let tokens = d
.get_item("tokens")
.get_item("tokens")?
.ok_or_else(|| exceptions::PyValueError::new_err("`tokens` must be specified"))?
.extract::<Vec<String>>()?;

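For context, `PySpecialToken` is the type accepted in `special_tokens` lists (e.g. for `TemplateProcessing`); below is a sketch of the two input forms this conversion handles, including the dict keys extracted above (illustrative only):

```python
from tokenizers.processors import TemplateProcessing

processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        # Tuple form: (token, id)
        ("[CLS]", 1),
        # Dict form, matching the `id`, `ids` and `tokens` keys extracted above
        {"id": "[SEP]", "ids": [2], "tokens": ["[SEP]"]},
    ],
)
```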
20 changes: 19 additions & 1 deletion bindings/python/src/tokenizer.rs
@@ -7,7 +7,6 @@ use pyo3::exceptions;
use pyo3::intern;
use pyo3::prelude::*;
use pyo3::types::*;
use pyo3::AsPyPointer;
use tk::models::bpe::BPE;
use tk::tokenizer::{
Model, PaddingDirection, PaddingParams, PaddingStrategy, PostProcessor, TokenizerImpl,
@@ -1110,6 +1109,25 @@ impl PyTokenizer {
self.tokenizer.id_to_token(id)
}

/// Modifies the tokenizer in order to use or not the special tokens
/// during encoding.
///
/// Args:
/// value (:obj:`bool`):
/// Whether to use the special tokens or not
///
#[setter]
fn set_encode_special_tokens(&mut self, value: bool) {
self.tokenizer.set_encode_special_tokens(value);
}
/// Get the value of the `encode_special_tokens` attribute
///
/// Returns:
/// :obj:`bool`: the tokenizer's encode_special_tokens attribute
#[getter]
fn get_encode_special_tokens(&self) -> bool {
self.tokenizer.get_encode_special_tokens()
}
/// Add the given tokens to the vocabulary
///
/// The given tokens are added only if they don't already exist in the vocabulary.
1 change: 0 additions & 1 deletion bindings/python/src/utils/iterators.rs
@@ -1,5 +1,4 @@
use pyo3::prelude::*;
use pyo3::AsPyPointer;
use std::collections::VecDeque;

/// A simple iterator that can be instantiated with a specified length.
31 changes: 31 additions & 0 deletions bindings/python/tests/bindings/test_tokenizer.py
@@ -457,3 +457,34 @@ def test_unigram_byte_fallback(self):
output = tokenizer.encode("A sentence 🤗")
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]

def test_encode_special_tokens(self):
tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_tokens(["<eot>"])
tokenizer.add_special_tokens(["<end_of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]

tokenizer.encode_special_tokens = True
assert tokenizer.encode_special_tokens == True

output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == [
"▁Hey",
"▁there",
"<",
"end",
"_",
"of",
"_",
"text",
">",
"▁dear",
"<eot>",
"▁friend",
"!",
]

tokenizer.add_tokens(["of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]
@@ -70,8 +70,10 @@ def test_datasets(self):

# START def_batch_iterator
def batch_iterator(batch_size=1000):
for i in range(0, len(dataset), batch_size):
yield dataset[i : i + batch_size]["text"]
# Only keep the text column to avoid decoding the rest of the columns unnecessarily
tok_dataset = dataset.select_columns("text")
for batch in tok_dataset.iter(batch_size):
yield batch["text"]

# END def_batch_iterator

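For context, a sketch of how such a batch iterator is typically consumed when training from an iterator (assumes `tokenizer`, `trainer` and the `dataset` above are already defined):

```python
# `length` is optional; it lets the progress bar report a total.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))
```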
4 changes: 2 additions & 2 deletions tokenizers/Makefile
@@ -4,9 +4,9 @@ TESTS_DIR = tests

dir_guard=@mkdir -p $(@D)

SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt
SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt $(DATA_DIR)/albert-base-v1-tokenizer.json
BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json

.PHONY : build
build :
12 changes: 6 additions & 6 deletions tokenizers/examples/unstable_wasm/www/package-lock.json

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions tokenizers/src/pre_tokenizers/byte_level.rs
@@ -9,6 +9,8 @@ use crate::tokenizer::{
};
use crate::utils::macro_rules_attribute;

/// Converts bytes to unicode characters.
/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9
fn bytes_char() -> HashMap<u8, char> {
let mut bs: Vec<u8> = vec![];
bs.extend(b'!'..=b'~');
@@ -33,6 +35,8 @@ fn bytes_char() -> HashMap<u8, char> {
}

lazy_static! {
/// Regex that matches exactly one token.
/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
static ref RE: SysRegex = SysRegex::new(
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
)
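For illustration, a Python sketch of the `bytes_char` mapping documented above (it mirrors the GPT-2 `encoder.py` logic linked in the comments; not part of this diff):

```python
def bytes_char():
    """Map every byte to a printable unicode character, as in GPT-2's byte-level BPE."""
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            # Remaining bytes are shifted into unused code points above 255.
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))
```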
101 changes: 101 additions & 0 deletions tokenizers/src/tokenizer/added_vocabulary.rs
@@ -160,6 +160,9 @@ pub(super) struct AddedVocabulary {
split_trie: MatchingSet,
/// A RegexSet containing all the normalized patterns used to split on AddedTokens
split_normalized_trie: MatchingSet,

/// Whether or not special tokens should be split when encoding. This is equivalent to ignoring them.
encode_special_tokens: bool,
}

impl AddedVocabulary {
@@ -180,6 +183,7 @@ impl AddedVocabulary {
special_tokens_set: HashSet::new(),
split_trie: (trie, vec![]),
split_normalized_trie: (normalized_trie, vec![]),
encode_special_tokens: false,
}
}
/// Size of the additional vocabulary
@@ -214,6 +218,15 @@ impl AddedVocabulary {
.or_else(|| model.id_to_token(id))
}

/// Set whether or not special tokens should be split when encoding.
pub fn set_encode_special_tokens(&mut self, value: bool) {
self.encode_special_tokens = value;
}

pub fn get_encode_special_tokens(&self) -> bool {
self.encode_special_tokens
}

/// Check if a token is a special token
pub fn is_special_token(&self, token: &str) -> bool {
self.special_tokens_set.contains(token)
@@ -356,6 +369,12 @@ impl AddedVocabulary {
let aho_id = mat.pattern();
let id = split_re.1[aho_id];
let added_token = &self.added_tokens_map_r.get(&id).unwrap();

if self.encode_special_tokens && self.special_tokens_set.contains(&added_token.content)
{
continue;
}

if added_token.single_word {
let start_space = start == 0 || !ends_with_word(&sentence[..start]);
let stop_space = stop == sentence.len() || !starts_with_word(&sentence[stop..]);
@@ -436,6 +455,18 @@ impl AddedVocabulary {
.split(|_, sequence| Ok(self.split_with_indices(sequence, &self.split_trie)))
.expect("AddedVocabulary bad split");

// <s> normalized = False
// "I read a book <s>Hey" -> "I read a book", " <s>", "Hey"

// </s> normalized = True -> "▁</s>"
// "I read a book</s>Hey" -> "I read a book</s>Hey"

// Day normalized = True -> "Day"
// "I read a book monday" -> "I read a book monday"

// [DAY] normalized = False -> "Day"
// "I read a [DAY] monday" -> "I read a " "[DAY]", "book monday"
// 320055
// 2. Then extract the normalized tokens from the normalized pieces of the string
pretokenized
.split(|_, mut sequence| {
Expand All @@ -444,6 +475,14 @@ impl AddedVocabulary {
})
.expect("AddedVocabulary bad split");

// ["I read a book", " <s>", "Hey"] -> ["▁I read a book", "▁ <s>", "▁Hey"]
// ["▁I read a book", "▁ <s>", "▁Hey"] -> [.., "▁ ", "<s>", "▁Hey"]

// </s> normalized = True -> "▁</s>"
// "I read a book</s>Hey" -> ["▁I read a book", "<","/","s",">", "Hey"]

// "I read a " "[DAY]", "book monday" -> "i read a " "[day]", "book monday"

pretokenized
}
}
@@ -880,4 +919,66 @@ mod tests {
]
);
}

#[test]
fn test_encode_special_tokens() {
let model = ModelMock::new(&[]);
let mut vocab = AddedVocabulary::new();
let normalizer = Lowercase;

vocab.add_tokens(
&[
AddedToken::from("<mask>", true)
.lstrip(true)
.rstrip(true)
.single_word(true),
AddedToken::from("ask>", false),
AddedToken::from("<pad>", true),
],
&model,
Some(&normalizer),
);
vocab.set_encode_special_tokens(true);

let result = vocab.extract_and_normalize(
Some(&normalizer),
"Hi <mask> there\t<mask>\t<mask>\u{2000} <pad> <mask><pad><pad>",
);

assert_eq!(
simplify_output(&result),
vec![
("hi <m", None),
("ask>", Some(vec![1])),
(" there\t<m", None),
("ask>", Some(vec![1])),
("\t<m", None),
("ask>", Some(vec![1])),
("\u{2000} <pad> <m", None),
("ask>", Some(vec![1])),
("<pad><pad>", None)
]
);

vocab.set_encode_special_tokens(false);

let result = vocab.extract_and_normalize(
Some(&normalizer),
"Hi <mask> there\t<mask>\t<mask>\u{2000} <pad> <mask><pad><pad>",
);
assert_eq!(
simplify_output(&result),
vec![
("hi", None),
(" <mask> ", Some(vec![0])),
("there", None),
("\t<mask>\t", Some(vec![0])),
("<mask>\u{2000} ", Some(vec![0])),
("<pad>", Some(vec![2])),
(" <mask>", Some(vec![0])),
("<pad>", Some(vec![2])),
("<pad>", Some(vec![2]))
]
);
}
}
10 changes: 10 additions & 0 deletions tokenizers/src/tokenizer/mod.rs
@@ -685,6 +685,16 @@ where
self.added_vocabulary.id_to_token(id, &self.model)
}

/// Set whether the added vocabulary should split special tokens when encoding
pub fn set_encode_special_tokens(&mut self, value: bool) {
self.added_vocabulary.set_encode_special_tokens(value);
}

/// Get the value of the added vocabulary's `encode_special_tokens` flag
pub fn get_encode_special_tokens(&self) -> bool {
self.added_vocabulary.get_encode_special_tokens()
}

/// Encode a single sequence
fn encode_single_sequence(
&self,
