Skip to content

Commit

Permalink
Added a few test cases in test_encode_decode round trip and also modi…
Browse files Browse the repository at this point in the history
…fied the slow tokenizer of RemBERT to have mask_token as an AddedToken with lstrip=True
  • Loading branch information
nileshkokane01 committed Nov 30, 2023
1 parent 99deea6 commit 8a1c9ea
Showing 1 changed file with 84 additions and 2 deletions.
86 changes: 84 additions & 2 deletions tests/models/rembert/test_tokenization_rembert.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,90 @@ def test_encode_decode_round_trip(self):
self.assertListEqual(tokens, ["▁", "清水寺は京都にある。"])
encoded_string = tokenizer.encode(text)
self.assertListEqual(encoded_string, [1000, 7, 0, 1001])
text_decode = tokenizer.convert_tokens_to_string(tokens)
self.assertEquals(text_decode, text)
decode_text = tokenizer.convert_tokens_to_string(tokens)
self.assertEquals(decode_text, text)

text = "That's awesome! 🤩 #HuggingFace, 🌟 Have a great day! 🌈"
tokens = tokenizer.tokenize(text)
self.assertListEqual(
tokens,
[
"▁That",
"'",
"s",
"▁a",
"w",
"es",
"ome",
"!",
"▁",
"🤩",
"▁",
"#",
"H",
"u",
"g",
"g",
"ing",
"F",
"a",
"ce",
",",
"▁",
"🌟",
"▁H",
"a",
"ve",
"▁a",
"▁great",
"▁day",
"!",
"▁",
"🌈",
],
)
encoded_string = tokenizer.encode(text)
self.assertListEqual(
encoded_string,
[
1000,
572,
32,
6,
10,
64,
110,
505,
146,
7,
0,
7,
0,
262,
51,
42,
42,
17,
365,
18,
133,
3,
7,
0,
369,
18,
123,
10,
590,
314,
146,
7,
0,
1001,
],
)
decode_text = tokenizer.convert_tokens_to_string(tokens)
self.assertEquals(decode_text, "That's awesome! 🤩 #HuggingFace, 🌟 Have a great day! 🌈")

text = "In the sky up above"
tokens = tokenizer._tokenize(text)
Expand Down

0 comments on commit 8a1c9ea

Please sign in to comment.