Merge branch 'main' of github.com:huggingface/tokenizers into branch_v0.15.1.rc1
ArthurZucker committed Jan 19, 2024
2 parents 7fafaf7 + 6a77d48 commit f387e2a
Showing 13 changed files with 197 additions and 22 deletions.
6 changes: 2 additions & 4 deletions README.md
@@ -70,7 +70,5 @@ print(output.tokens)
# ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
```

Check the [python documentation](https://huggingface.co/docs/tokenizers/index) or the

[python quicktour](https://huggingface.co/docs/tokenizers/python/latest/quicktour.html) to learn
more!
Check the [documentation](https://huggingface.co/docs/tokenizers/index)
or the [quicktour](https://huggingface.co/docs/tokenizers/quicktour) to learn more!
6 changes: 3 additions & 3 deletions bindings/python/Cargo.toml
@@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
serde_json = "1.0"
libc = "0.2"
env_logger = "0.10.0"
pyo3 = { version = "0.19" }
numpy = "0.19.0"
pyo3 = { version = "0.20" }
numpy = "0.20.0"
ndarray = "0.15"
onig = { version = "6.4", default-features = false }
itertools = "0.11"
@@ -26,7 +26,7 @@ path = "../../tokenizers"

[dev-dependencies]
tempfile = "3.8"
pyo3 = { version = "0.19", features = ["auto-initialize"] }
pyo3 = { version = "0.20", features = ["auto-initialize"] }

[features]
defaut = ["pyo3/extension-module"]
12 changes: 12 additions & 0 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -836,6 +836,18 @@ class Tokenizer:
Returns:
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
"""
pass
@property
def encode_special_tokens(self):
"""
Modifies the tokenizer in order to use or not the special tokens
during encoding.
Args:
value (:obj:`bool`):
Whether to use the special tokens or not
"""
pass
@staticmethod
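For reference, a minimal usage sketch of the new attribute (not part of the diff; it mirrors the Python test added further down):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_special_tokens(["<end_of_text>"])

# By default, special tokens are matched as whole units during encoding.
print(tokenizer.encode_special_tokens)  # False

# Enabling the flag makes the tokenizer split special tokens like regular text.
tokenizer.encode_special_tokens = True
output = tokenizer.encode("Hey there<end_of_text>", add_special_tokens=False)
print(output.tokens)  # "<end_of_text>" is now broken into sub-tokens
```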
6 changes: 3 additions & 3 deletions bindings/python/src/processors.rs
@@ -266,15 +266,15 @@ impl FromPyObject<'_> for PySpecialToken {
Ok(Self(v.into()))
} else if let Ok(d) = ob.downcast::<PyDict>() {
let id = d
.get_item("id")
.get_item("id")?
.ok_or_else(|| exceptions::PyValueError::new_err("`id` must be specified"))?
.extract::<String>()?;
let ids = d
.get_item("ids")
.get_item("ids")?
.ok_or_else(|| exceptions::PyValueError::new_err("`ids` must be specified"))?
.extract::<Vec<u32>>()?;
let tokens = d
.get_item("tokens")
.get_item("tokens")?
.ok_or_else(|| exceptions::PyValueError::new_err("`tokens` must be specified"))?
.extract::<Vec<String>>()?;

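For context, `PySpecialToken` is the type accepted in `special_tokens` lists (e.g. for `TemplateProcessing`); below is a sketch of the two input forms this conversion handles, including the dict keys extracted above (illustrative only):

```python
from tokenizers.processors import TemplateProcessing

processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        # Tuple form: (token, id)
        ("[CLS]", 1),
        # Dict form, matching the `id`, `ids` and `tokens` keys extracted above
        {"id": "[SEP]", "ids": [2], "tokens": ["[SEP]"]},
    ],
)
```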
20 changes: 19 additions & 1 deletion bindings/python/src/tokenizer.rs
@@ -7,7 +7,6 @@ use pyo3::exceptions;
use pyo3::intern;
use pyo3::prelude::*;
use pyo3::types::*;
use pyo3::AsPyPointer;
use tk::models::bpe::BPE;
use tk::tokenizer::{
Model, PaddingDirection, PaddingParams, PaddingStrategy, PostProcessor, TokenizerImpl,
@@ -1110,6 +1109,25 @@ impl PyTokenizer {
self.tokenizer.id_to_token(id)
}

/// Modifies the tokenizer in order to use or not the special tokens
/// during encoding.
///
/// Args:
/// value (:obj:`bool`):
/// Whether to use the special tokens or not
///
#[setter]
fn set_encode_special_tokens(&mut self, value: bool) {
self.tokenizer.set_encode_special_tokens(value);
}
/// Get the value of the `encode_special_tokens` attribute
///
/// Returns:
/// :obj:`bool`: the tokenizer's encode_special_tokens attribute
#[getter]
fn get_encode_special_tokens(&self) -> bool {
self.tokenizer.get_encode_special_tokens()
}
/// Add the given tokens to the vocabulary
///
/// The given tokens are added only if they don't already exist in the vocabulary.
1 change: 0 additions & 1 deletion bindings/python/src/utils/iterators.rs
@@ -1,5 +1,4 @@
use pyo3::prelude::*;
use pyo3::AsPyPointer;
use std::collections::VecDeque;

/// A simple iterator that can be instantiated with a specified length.
31 changes: 31 additions & 0 deletions bindings/python/tests/bindings/test_tokenizer.py
@@ -457,3 +457,34 @@ def test_unigram_byte_fallback(self):
output = tokenizer.encode("A sentence 🤗")
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]

def test_encode_special_tokens(self):
tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_tokens(["<eot>"])
tokenizer.add_special_tokens(["<end_of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]

tokenizer.encode_special_tokens = True
assert tokenizer.encode_special_tokens == True

output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == [
"▁Hey",
"▁there",
"<",
"end",
"_",
"of",
"_",
"text",
">",
"▁dear",
"<eot>",
"▁friend",
"!",
]

tokenizer.add_tokens(["of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]
@@ -70,8 +70,10 @@ def test_datasets(self):

# START def_batch_iterator
def batch_iterator(batch_size=1000):
for i in range(0, len(dataset), batch_size):
yield dataset[i : i + batch_size]["text"]
# Only keep the text column to avoid decoding the rest of the columns unnecessarily
tok_dataset = dataset.select_columns("text")
for batch in tok_dataset.iter(batch_size):
yield batch["text"]

# END def_batch_iterator

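For context, a sketch of how such a batch iterator is typically consumed when training from an iterator (assumes `tokenizer`, `trainer` and the `dataset` above are already defined):

```python
# `length` is optional; it lets the progress bar report a total.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))
```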
4 changes: 2 additions & 2 deletions tokenizers/Makefile
@@ -4,9 +4,9 @@ TESTS_DIR = tests

dir_guard=@mkdir -p $(@D)

SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt
SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt $(DATA_DIR)/albert-base-v1-tokenizer.json
BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json

.PHONY : build
build :
12 changes: 6 additions & 6 deletions tokenizers/examples/unstable_wasm/www/package-lock.json

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions tokenizers/src/pre_tokenizers/byte_level.rs
@@ -9,6 +9,8 @@ use crate::tokenizer::{
};
use crate::utils::macro_rules_attribute;

/// Converts bytes to unicode characters.
/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9
fn bytes_char() -> HashMap<u8, char> {
let mut bs: Vec<u8> = vec![];
bs.extend(b'!'..=b'~');
@@ -33,6 +35,8 @@ fn bytes_char() -> HashMap<u8, char> {
}

lazy_static! {
/// Regex that matches exactly one token.
/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
static ref RE: SysRegex = SysRegex::new(
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
)
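For illustration, a Python sketch of the `bytes_char` mapping documented above (it mirrors the GPT-2 `encoder.py` logic linked in the comments; not part of this diff):

```python
def bytes_char():
    """Map every byte to a printable unicode character, as in GPT-2's byte-level BPE."""
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            # Remaining bytes are shifted into unused code points above 255.
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))
```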
101 changes: 101 additions & 0 deletions tokenizers/src/tokenizer/added_vocabulary.rs
@@ -160,6 +160,9 @@ pub(super) struct AddedVocabulary {
split_trie: MatchingSet,
/// A RegexSet containing all the normalized patterns used to split on AddedTokens
split_normalized_trie: MatchingSet,

/// Whether or not special tokens should be split when encoding. This is equivalent to ignoring them.
encode_special_tokens: bool,
}

impl AddedVocabulary {
@@ -180,6 +183,7 @@ impl AddedVocabulary {
special_tokens_set: HashSet::new(),
split_trie: (trie, vec![]),
split_normalized_trie: (normalized_trie, vec![]),
encode_special_tokens: false,
}
}
/// Size of the additional vocabulary
@@ -214,6 +218,15 @@ impl AddedVocabulary {
.or_else(|| model.id_to_token(id))
}

/// Set whether or not special tokens should be split when encoding.
pub fn set_encode_special_tokens(&mut self, value: bool) {
self.encode_special_tokens = value;
}

pub fn get_encode_special_tokens(&self) -> bool {
self.encode_special_tokens
}

/// Check if a token is a special token
pub fn is_special_token(&self, token: &str) -> bool {
self.special_tokens_set.contains(token)
@@ -356,6 +369,12 @@ impl AddedVocabulary {
let aho_id = mat.pattern();
let id = split_re.1[aho_id];
let added_token = &self.added_tokens_map_r.get(&id).unwrap();

if self.encode_special_tokens && self.special_tokens_set.contains(&added_token.content)
{
continue;
}

if added_token.single_word {
let start_space = start == 0 || !ends_with_word(&sentence[..start]);
let stop_space = stop == sentence.len() || !starts_with_word(&sentence[stop..]);
@@ -436,6 +455,18 @@ impl AddedVocabulary {
.split(|_, sequence| Ok(self.split_with_indices(sequence, &self.split_trie)))
.expect("AddedVocabulary bad split");

// <s> normalized = False
// "I read a book <s>Hey" -> "I read a book", " <s>", "Hey"

// </s> normalized = True -> "▁</s>"
// "I read a book</s>Hey" -> "I read a book</s>Hey"

// Day normalized = True -> "Day"
// "I read a book monday" -> "I read a book monday"

// [DAY] normalized = False -> "Day"
// "I read a [DAY] monday" -> "I read a " "[DAY]", "book monday"
// 320055
// 2. Then extract the normalized tokens from the normalized pieces of the string
pretokenized
.split(|_, mut sequence| {
Expand All @@ -444,6 +475,14 @@ impl AddedVocabulary {
})
.expect("AddedVocabulary bad split");

// ["I read a book", " <s>", "Hey"] -> ["▁I read a book", "▁ <s>", "▁Hey"]
// ["▁I read a book", "▁ <s>", "▁Hey"] -> [.., "▁ ", "<s>", "▁Hey"]

// </s> normalized = True -> "▁</s>"
// "I read a book</s>Hey" -> ["▁I read a book", "<","/","s",">", "Hey"]

// "I read a " "[DAY]", "book monday" -> "i read a " "[day]", "book monday"

pretokenized
}
}
@@ -880,4 +919,66 @@ mod tests {
]
);
}

#[test]
fn test_encode_special_tokens() {
let model = ModelMock::new(&[]);
let mut vocab = AddedVocabulary::new();
let normalizer = Lowercase;

vocab.add_tokens(
&[
AddedToken::from("<mask>", true)
.lstrip(true)
.rstrip(true)
.single_word(true),
AddedToken::from("ask>", false),
AddedToken::from("<pad>", true),
],
&model,
Some(&normalizer),
);
vocab.set_encode_special_tokens(true);

let result = vocab.extract_and_normalize(
Some(&normalizer),
"Hi <mask> there\t<mask>\t<mask>\u{2000} <pad> <mask><pad><pad>",
);

assert_eq!(
simplify_output(&result),
vec![
("hi <m", None),
("ask>", Some(vec![1])),
(" there\t<m", None),
("ask>", Some(vec![1])),
("\t<m", None),
("ask>", Some(vec![1])),
("\u{2000} <pad> <m", None),
("ask>", Some(vec![1])),
("<pad><pad>", None)
]
);

vocab.set_encode_special_tokens(false);

let result = vocab.extract_and_normalize(
Some(&normalizer),
"Hi <mask> there\t<mask>\t<mask>\u{2000} <pad> <mask><pad><pad>",
);
assert_eq!(
simplify_output(&result),
vec![
("hi", None),
(" <mask> ", Some(vec![0])),
("there", None),
("\t<mask>\t", Some(vec![0])),
("<mask>\u{2000} ", Some(vec![0])),
("<pad>", Some(vec![2])),
(" <mask>", Some(vec![0])),
("<pad>", Some(vec![2])),
("<pad>", Some(vec![2]))
]
);
}
}
10 changes: 10 additions & 0 deletions tokenizers/src/tokenizer/mod.rs
@@ -685,6 +685,16 @@ where
self.added_vocabulary.id_to_token(id, &self.model)
}

/// Set whether the added vocabulary should split special tokens when encoding
pub fn set_encode_special_tokens(&mut self, value: bool) {
self.added_vocabulary.set_encode_special_tokens(value);
}

/// Get the value of the added vocabulary's `encode_special_tokens` flag
pub fn get_encode_special_tokens(&self) -> bool {
self.added_vocabulary.get_encode_special_tokens()
}

/// Encode a single sequence
fn encode_single_sequence(
&self,
