diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index a1e41c283..f2099ab4e 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -64,7 +64,6 @@ def test_can_set_normalized(self):
         assert added_token.single_word == False
         assert added_token.normalized == False
 
-
 class TestTokenizer:
     def test_has_expected_type_and_methods(self):
         tokenizer = Tokenizer(BPE())
@@ -457,3 +456,18 @@ def test_unigram_byte_fallback(self):
         output = tokenizer.encode("A sentence 🤗")
         assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
         assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
+
+    def test_encode_special_tokens(self):
+        tokenizer = Tokenizer.from_pretrained("t5-base")
+        tokenizer.add_tokens(["<eot>"])
+        tokenizer.add_special_tokens(["<end_of_text>"])
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == ['▁Hey', '▁there', '<end_of_text>', '▁dear', '<eot>', '▁friend', '!']
+
+        tokenizer.set_encode_special_tokens(True)
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == ['▁Hey', '▁there', '<', 'end', '_', 'of', '_', 'text', '>', '▁dear', '<eot>', '▁friend', '!']
+
+        tokenizer.add_tokens(["of_text>"])
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == ['▁Hey', '▁there', '<', 'end', '_', 'of_text>', '▁dear', '<eot>', '▁friend', '!']
\ No newline at end of file
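
Note: below is a minimal usage sketch of the behavior the new test exercises, assuming the `tokenizers` Python bindings with the `set_encode_special_tokens` setter shown in this diff; the "t5-base" checkpoint, input string, and token names all mirror the test above, and the commented outputs are taken directly from its assertions.

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_pretrained("t5-base")
    tokenizer.add_tokens(["<eot>"])                  # plain added token
    tokenizer.add_special_tokens(["<end_of_text>"])  # special added token

    text = "Hey there<end_of_text> dear<eot>friend!"

    # Default behavior: both added tokens are matched whole during encoding.
    print(tokenizer.encode(text, add_special_tokens=False).tokens)
    # ['▁Hey', '▁there', '<end_of_text>', '▁dear', '<eot>', '▁friend', '!']

    # With encode_special_tokens enabled, *special* tokens are split like
    # ordinary text; plain added tokens such as '<eot>' stay matched whole.
    tokenizer.set_encode_special_tokens(True)
    print(tokenizer.encode(text, add_special_tokens=False).tokens)
    # ['▁Hey', '▁there', '<', 'end', '_', 'of', '_', 'text', '>', '▁dear', '<eot>', '▁friend', '!']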