diff --git a/README.md b/README.md
index 52d523dff..bac865508 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,5 @@ print(output.tokens)
 # ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
 ```

-Check the [python documentation](https://huggingface.co/docs/tokenizers/index) or the
-
-[python quicktour](https://huggingface.co/docs/tokenizers/python/latest/quicktour.html) to learn
-more!
+Check the [documentation](https://huggingface.co/docs/tokenizers/index)
+or the [quicktour](https://huggingface.co/docs/tokenizers/quicktour) to learn more!
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index f1d20aa1a..26bb5ef7f 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.10.0"
-pyo3 = { version = "0.19" }
-numpy = "0.19.0"
+pyo3 = { version = "0.20" }
+numpy = "0.20.0"
 ndarray = "0.15"
 onig = { version = "6.4", default-features = false }
 itertools = "0.11"
@@ -26,7 +26,7 @@ path = "../../tokenizers"

 [dev-dependencies]
 tempfile = "3.8"
-pyo3 = { version = "0.19", features = ["auto-initialize"] }
+pyo3 = { version = "0.20", features = ["auto-initialize"] }

 [features]
 defaut = ["pyo3/extension-module"]
diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi
index 4b80a7f75..7c21c5b56 100644
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -836,6 +836,18 @@ class Tokenizer:
         Returns:
             A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch

+        """
+        pass
+    @property
+    def encode_special_tokens(self):
+        """
+        Modifies the tokenizer in order to use or not the special tokens
+        during encoding.
+
+        Args:
+            value (:obj:`bool`):
+                Whether to use the special tokens or not
+        """
         pass

     @staticmethod
diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 3600deec5..55a69287e 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -266,15 +266,15 @@ impl FromPyObject<'_> for PySpecialToken {
             Ok(Self(v.into()))
         } else if let Ok(d) = ob.downcast::<PyDict>() {
             let id = d
-                .get_item("id")
+                .get_item("id")?
                 .ok_or_else(|| exceptions::PyValueError::new_err("`id` must be specified"))?
                 .extract::<String>()?;
             let ids = d
-                .get_item("ids")
+                .get_item("ids")?
                 .ok_or_else(|| exceptions::PyValueError::new_err("`ids` must be specified"))?
                 .extract::<Vec<u32>>()?;
             let tokens = d
-                .get_item("tokens")
+                .get_item("tokens")?
                 .ok_or_else(|| exceptions::PyValueError::new_err("`tokens` must be specified"))?
                 .extract::<Vec<String>>()?;

diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 25e93159d..4e792ef54 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -7,7 +7,6 @@ use pyo3::exceptions;
 use pyo3::intern;
 use pyo3::prelude::*;
 use pyo3::types::*;
-use pyo3::AsPyPointer;
 use tk::models::bpe::BPE;
 use tk::tokenizer::{
     Model, PaddingDirection, PaddingParams, PaddingStrategy, PostProcessor, TokenizerImpl,
@@ -1110,6 +1109,25 @@ impl PyTokenizer {
         self.tokenizer.id_to_token(id)
     }

+    /// Modifies the tokenizer in order to use or not the special tokens
+    /// during encoding.
+    ///
+    /// Args:
+    ///     value (:obj:`bool`):
+    ///         Whether to use the special tokens or not
+    ///
+    #[setter]
+    fn set_encode_special_tokens(&mut self, value: bool) {
+        self.tokenizer.set_encode_special_tokens(value);
+    }
+    /// Get the value of the `encode_special_tokens` attribute
+    ///
+    /// Returns:
+    ///     :obj:`bool`: the tokenizer's encode_special_tokens attribute
+    #[getter]
+    fn get_encode_special_tokens(&self) -> bool {
+        self.tokenizer.get_encode_special_tokens()
+    }
     /// Add the given tokens to the vocabulary
     ///
     /// The given tokens are added only if they don't already exist in the vocabulary.
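For orientation, here is a minimal sketch of how the new `encode_special_tokens` property exposed by the binding above is meant to be used from Python. It mirrors the test added later in this diff; the `t5-base` checkpoint and the `<end_of_text>` token are illustrative choices (and require downloading the pretrained tokenizer), not part of the change itself:

```python
from tokenizers import Tokenizer

# Illustrative setup: any pretrained tokenizer plus a custom special token works here.
tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_special_tokens(["<end_of_text>"])

# Default behaviour: the special token is extracted and kept as a single token.
print(tokenizer.encode("Hey<end_of_text>", add_special_tokens=False).tokens)

# With the new flag, special tokens are no longer extracted and get split
# like any other piece of text.
tokenizer.encode_special_tokens = True
assert tokenizer.encode_special_tokens is True
print(tokenizer.encode("Hey<end_of_text>", add_special_tokens=False).tokens)
```
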
diff --git a/bindings/python/src/utils/iterators.rs b/bindings/python/src/utils/iterators.rs
index cf6310b4d..d19a54eb5 100644
--- a/bindings/python/src/utils/iterators.rs
+++ b/bindings/python/src/utils/iterators.rs
@@ -1,5 +1,4 @@
 use pyo3::prelude::*;
-use pyo3::AsPyPointer;
 use std::collections::VecDeque;

 /// An simple iterator that can be instantiated with a specified length.
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index a1e41c283..2eb5ce59c 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -457,3 +457,34 @@ def test_unigram_byte_fallback(self):
         output = tokenizer.encode("A sentence 🤗")
         assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
         assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
+
+    def test_encode_special_tokens(self):
+        tokenizer = Tokenizer.from_pretrained("t5-base")
+        tokenizer.add_tokens(["<eot>"])
+        tokenizer.add_special_tokens(["<end_of_text>"])
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]
+
+        tokenizer.encode_special_tokens = True
+        assert tokenizer.encode_special_tokens == True
+
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == [
+            "▁Hey",
+            "▁there",
+            "<",
+            "end",
+            "_",
+            "of",
+            "_",
+            "text",
+            ">",
+            "▁dear",
+            "<eot>",
+            "▁friend",
+            "!",
+        ]
+
+        tokenizer.add_tokens(["of_text>"])
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]
diff --git a/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py b/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py
index bba55a48b..58d93351d 100644
--- a/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py
+++ b/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py
@@ -70,8 +70,10 @@ def test_datasets(self):

         # START def_batch_iterator
         def batch_iterator(batch_size=1000):
-            for i in range(0, len(dataset), batch_size):
-                yield dataset[i : i + batch_size]["text"]
+            # Only keep the text column to avoid decoding the rest of the columns unnecessarily
+            tok_dataset = dataset.select_columns("text")
+            for batch in tok_dataset.iter(batch_size):
+                yield batch["text"]

         # END def_batch_iterator
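For completeness, a sketch of how the revised `batch_iterator` plugs into `train_from_iterator`. The `wikitext` dataset and the Unigram trainer configuration are assumptions taken from the surrounding tutorial test, not part of this diff:

```python
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers

dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")

def batch_iterator(batch_size=1000):
    # Only keep the text column to avoid decoding the rest of the columns unnecessarily
    tok_dataset = dataset.select_columns("text")
    for batch in tok_dataset.iter(batch_size):
        yield batch["text"]

tokenizer = Tokenizer(models.Unigram())
trainer = trainers.UnigramTrainer(vocab_size=20000, special_tokens=["<unk>"], unk_token="<unk>")
# `length` is optional; it only lets the progress bar report a total.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))
```
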
diff --git a/tokenizers/Makefile b/tokenizers/Makefile
index 486f5a568..a407afffc 100644
--- a/tokenizers/Makefile
+++ b/tokenizers/Makefile
@@ -4,9 +4,9 @@ TESTS_DIR = tests

 dir_guard=@mkdir -p $(@D)

-SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt
+SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt $(DATA_DIR)/albert-base-v1-tokenizer.json
 BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
-TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
+TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json

 .PHONY : build
 build :
diff --git a/tokenizers/examples/unstable_wasm/www/package-lock.json b/tokenizers/examples/unstable_wasm/www/package-lock.json
index 042b83eac..8ca2c7b3a 100644
--- a/tokenizers/examples/unstable_wasm/www/package-lock.json
+++ b/tokenizers/examples/unstable_wasm/www/package-lock.json
@@ -1417,9 +1417,9 @@
       }
     },
     "node_modules/follow-redirects": {
-      "version": "1.15.1",
-      "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.1.tgz",
-      "integrity": "sha512-yLAMQs+k0b2m7cVxpS1VKJVvoz7SS9Td1zss3XRwXj+ZDH00RJgnuLx7E44wx02kQLrdM3aOOy+FpzS7+8OizA==",
+      "version": "1.15.4",
+      "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz",
+      "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==",
       "dev": true,
       "funding": [
         {
@@ -4751,9 +4751,9 @@
       }
     },
     "follow-redirects": {
-      "version": "1.15.1",
-      "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.1.tgz",
-      "integrity": "sha512-yLAMQs+k0b2m7cVxpS1VKJVvoz7SS9Td1zss3XRwXj+ZDH00RJgnuLx7E44wx02kQLrdM3aOOy+FpzS7+8OizA==",
+      "version": "1.15.4",
+      "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz",
+      "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==",
       "dev": true
     },
     "forwarded": {
diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 03fb4b405..6343bbd07 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -9,6 +9,8 @@ use crate::tokenizer::{
 };
 use crate::utils::macro_rules_attribute;

+/// Converts bytes to unicode characters.
+/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9
 fn bytes_char() -> HashMap<u8, char> {
     let mut bs: Vec<u8> = vec![];
     bs.extend(b'!'..=b'~');
@@ -33,6 +35,8 @@ fn bytes_char() -> HashMap<u8, char> {
 }

 lazy_static! {
+    /// Regex that matches exactly one token.
+    /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
     static ref RE: SysRegex = SysRegex::new(
         r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
     )
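The `bytes_char` function documented above mirrors the `bytes_to_unicode` helper from the referenced GPT-2 `encoder.py`. For readers who do not want to follow the link, a rough Python sketch of the same mapping (not part of this diff):

```python
def bytes_to_unicode():
    """Map every byte 0..255 to a printable unicode character.

    Printable ASCII and two Latin-1 ranges map to themselves; the remaining
    bytes are shifted up by 256 so that none of them ends up on a whitespace
    or control character. This is the same construction as the Rust bytes_char.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))
```
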
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 6870b7ec7..487fb4479 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -160,6 +160,9 @@ pub(super) struct AddedVocabulary {
     split_trie: MatchingSet,
     /// A RegexSet containing all the normalized patterns used to split on AddedTokens
     split_normalized_trie: MatchingSet,
+
+    /// Whether or not special tokens should be split when encoding. This is equivalent to ignoring them
+    encode_special_tokens: bool,
 }

 impl AddedVocabulary {
@@ -180,6 +183,7 @@
             special_tokens_set: HashSet::new(),
             split_trie: (trie, vec![]),
             split_normalized_trie: (normalized_trie, vec![]),
+            encode_special_tokens: false,
         }
     }
     /// Size of the additional vocabulary
@@ -214,6 +218,15 @@
             .or_else(|| model.id_to_token(id))
     }
+    /// Set whether or not the special tokens should be split when encoding
+    pub fn set_encode_special_tokens(&mut self, value: bool) {
+        self.encode_special_tokens = value;
+    }
+
+    pub fn get_encode_special_tokens(&self) -> bool {
+        self.encode_special_tokens
+    }
+
     /// Check if a token is a special token
     pub fn is_special_token(&self, token: &str) -> bool {
         self.special_tokens_set.contains(token)
     }
@@ -356,6 +369,12 @@
             let aho_id = mat.pattern();
             let id = split_re.1[aho_id];
             let added_token = &self.added_tokens_map_r.get(&id).unwrap();
+
+            if self.encode_special_tokens && self.special_tokens_set.contains(&added_token.content)
+            {
+                continue;
+            }
+
             if added_token.single_word {
                 let start_space = start == 0 || !ends_with_word(&sentence[..start]);
                 let stop_space = stop == sentence.len() || !starts_with_word(&sentence[stop..]);
@@ -436,6 +455,18 @@
             .split(|_, sequence| Ok(self.split_with_indices(sequence, &self.split_trie)))
             .expect("AddedVocabulary bad split");

+        // <s> normalized = False
+        // "I read a book <s>Hey" -> "I read a book", " <s>", "Hey"
+
+        // </s> normalized = True -> "▁</s>"
+        // "I read a book</s>Hey" -> "I read a book</s>Hey"
+
+        // Day normalized = True -> "Day"
+        // "I read a book monday" -> "I read a book monday"
+
+        // [DAY] normalized = False -> "Day"
+        // "I read a [DAY] monday" -> "I read a " "[DAY]", "book monday"
+
         // 2. Then extract the normalized tokens from the normalized pieces of the string
         pretokenized
             .split(|_, mut sequence| {
@@ -444,6 +475,14 @@
             })
             .expect("AddedVocabulary bad split");

+        // ["I read a book", " <s>", "Hey"] -> ["▁I read a book", "▁ <s>", "▁Hey"]
+        // ["▁I read a book", "▁ <s>", "▁Hey"] -> [.., "▁ ", "<s>", "▁Hey"]
+
+        // </s> normalized = True -> "▁</s>"
+        // "I read a book</s>Hey" -> ["▁I read a book", "<","/","s",">", "Hey"]
+
+        // "I read a " "[DAY]", "book monday" -> "i read a " "[day]", "book monday"
+
         pretokenized
     }
 }
@@ -880,4 +919,66 @@ mod tests {
             ]
         );
     }
+
+    #[test]
+    fn test_encode_special_tokens() {
+        let model = ModelMock::new(&[]);
+        let mut vocab = AddedVocabulary::new();
+        let normalizer = Lowercase;
+
+        vocab.add_tokens(
+            &[
+                AddedToken::from("<mask>", true)
+                    .lstrip(true)
+                    .rstrip(true)
+                    .single_word(true),
+                AddedToken::from("ask>", false),
+                AddedToken::from("<deep-learning>", true),
+            ],
+            &model,
+            Some(&normalizer),
+        );
+        vocab.set_encode_special_tokens(true);
+
+        let result = vocab.extract_and_normalize(
+            Some(&normalizer),
+            "Hi <mask> there\t<mask>\t<mask>\u{2000} <deep-learning> <mask><deep-learning><deep-learning>",
+        );
+
+        assert_eq!(
+            simplify_output(&result),
+            vec![
+                ("hi <m", None),
+                ("ask>", Some(vec![1])),
+                (" there\t<m", None),
+                ("ask>", Some(vec![1])),
+                ("\t<m", None),
+                ("ask>", Some(vec![1])),
+                ("\u{2000} <deep-learning> <m", None),
+                ("ask>", Some(vec![1])),
+                ("<deep-learning><deep-learning>", None)
+            ]
+        );
+
+        vocab.set_encode_special_tokens(false);
+
+        let result = vocab.extract_and_normalize(
+            Some(&normalizer),
+            "Hi <mask> there\t<mask>\t<mask>\u{2000} <deep-learning> <mask><deep-learning><deep-learning>",
+        );
+        assert_eq!(
+            simplify_output(&result),
+            vec![
+                ("hi", None),
+                (" <mask> ", Some(vec![0])),
+                ("there", None),
+                ("\t<mask>\t", Some(vec![0])),
+                ("<mask>\u{2000} ", Some(vec![0])),
+                ("<deep-learning>", Some(vec![2])),
+                (" <mask>", Some(vec![0])),
+                ("<deep-learning>", Some(vec![2])),
+                ("<deep-learning>", Some(vec![2]))
+            ]
+        );
+    }
 }
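The Rust test above leans on `AddedToken` options (`lstrip`, `rstrip`, `single_word`, and whether a token is special or normalized). For readers coming from the Python bindings, an equivalent setup looks roughly like this; the token contents simply echo the ones used in the test and are not part of the diff:

```python
from tokenizers import AddedToken, Tokenizer, models

# Empty model: we only care about how added tokens are matched, not about the vocab.
tokenizer = Tokenizer(models.BPE())

# Special token that swallows surrounding whitespace and must stand alone as a word.
tokenizer.add_special_tokens(
    [AddedToken("<mask>", lstrip=True, rstrip=True, single_word=True)]
)

# Regular added token, matched on the normalized text.
tokenizer.add_tokens([AddedToken("ask>", normalized=True)])
```
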
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 2d7e10f73..ae6a64362 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -685,6 +685,16 @@ where
         self.added_vocabulary.id_to_token(id, &self.model)
     }

+    /// Set whether the added vocabulary should split on special tokens during encoding
+    pub fn set_encode_special_tokens(&mut self, value: bool) {
+        self.added_vocabulary.set_encode_special_tokens(value);
+    }
+
+    /// Get the current value of `encode_special_tokens`
+    pub fn get_encode_special_tokens(&self) -> bool {
+        self.added_vocabulary.get_encode_special_tokens()
+    }
+
     /// Encode a single sequence
     fn encode_single_sequence(
         &self,