Skip to content

Commit

Permalink
add a test in python
Browse the repository files at this point in the history
  • Loading branch information
ArthurZucker committed Jan 19, 2024
1 parent 842eced commit 7fb3c18
Showing 1 changed file with 15 additions and 1 deletion.
16 changes: 15 additions & 1 deletion bindings/python/tests/bindings/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ def test_can_set_normalized(self):
assert added_token.single_word == False
assert added_token.normalized == False


class TestTokenizer:
def test_has_expected_type_and_methods(self):
tokenizer = Tokenizer(BPE())
Expand Down Expand Up @@ -457,3 +456,18 @@ def test_unigram_byte_fallback(self):
output = tokenizer.encode("A sentence 🤗")
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]

def test_encode_special_tokens(self):
    """Special tokens are split like ordinary text once `encode_special_tokens`
    is enabled, while regular added tokens keep matching either way."""
    tokenizer = Tokenizer.from_pretrained("t5-base")
    tokenizer.add_tokens(["<eot>"])
    tokenizer.add_special_tokens(["<end_of_text>"])
    sentence = "Hey there<end_of_text> dear<eot>friend!"

    # Default behavior: the special token is matched and emitted whole.
    output = tokenizer.encode(sentence, add_special_tokens=False)
    assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]

    # With encode_special_tokens on, "<end_of_text>" is tokenized as plain
    # text, but the non-special added token "<eot>" is still matched whole.
    tokenizer.set_encode_special_tokens(True)
    output = tokenizer.encode(sentence, add_special_tokens=False)
    assert output.tokens == [
        "▁Hey", "▁there", "<", "end", "_", "of", "_", "text", ">", "▁dear", "<eot>", "▁friend", "!"
    ]

    # A newly added (non-special) token participates in matching even while
    # special-token encoding is enabled.
    tokenizer.add_tokens(["of_text>"])
    output = tokenizer.encode(sentence, add_special_tokens=False)
    assert output.tokens == [
        "▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"
    ]

0 comments on commit 7fb3c18

Please sign in to comment.