diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index e0462dd7348383..f29d31bfdfac17 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -153,9 +153,9 @@ def __init__(
         legacy=None,
         **kwargs,
     ) -> None:
-        pad_token = AddedToken(pad_token, rstrip=True, lstrip=True)
-        unk_token = AddedToken(unk_token, rstrip=True, lstrip=True)
-        eos_token = AddedToken(eos_token, rstrip=True, lstrip=True)
+        pad_token = AddedToken(pad_token, rstrip=True, lstrip=True) if isinstance(pad_token, str) else pad_token
+        unk_token = AddedToken(unk_token, rstrip=True, lstrip=True) if isinstance(unk_token, str) else unk_token
+        eos_token = AddedToken(eos_token, rstrip=True, lstrip=True) if isinstance(eos_token, str) else eos_token
 
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
         self.vocab_file = vocab_file
@@ -167,7 +167,9 @@ def __init__(
 
         if additional_special_tokens is not None:
             extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
-            if extra_ids > 0 and extra_ids != len(extra_tokens):
+            if len(extra_tokens) < 1:
+                additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
+            elif extra_ids > 0 and extra_ids != len(extra_tokens):
                 raise ValueError(
                     f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                     " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 8ac37101453691..6f55b68c3f6df4 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -4110,7 +4110,7 @@ def test_additional_special_tokens_serialization(self):
 
                 # make sure the token was added
                 self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
-                self.assertEqual(new_eos, tokenizer.added_tokens_decoder[self.added_tokens_encoder[str(new_eos)]])
+                self.assertEqual(new_eos, tokenizer.added_tokens_decoder[tokenizer.added_tokens_encoder[str(new_eos)]])
 
                 # At this point if you save the tokenizer and reload it, the token will be saved as special
                 # it does not matter if you set the attribute
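
For context, a minimal sketch of the behavior the two tokenizer changes target. This is illustrative only and not part of the patch; `"spiece.model"` is a hypothetical stand-in for any valid T5 SentencePiece vocabulary file on disk.

```python
# Sketch of the intended post-patch behavior (illustrative, not part of the diff).
from transformers import AddedToken, T5Tokenizer

# 1. AddedToken instances passed by the caller are no longer re-wrapped, so
#    caller-chosen stripping flags survive __init__ instead of being
#    overwritten with rstrip=True / lstrip=True.
custom_eos = AddedToken("</s>", rstrip=False, lstrip=False)
tokenizer = T5Tokenizer("spiece.model", eos_token=custom_eos)

# 2. When additional_special_tokens contains no "<extra_id_*>" sentinel, the
#    <extra_id_0> ... <extra_id_{extra_ids - 1}> tokens are now appended
#    automatically instead of being silently dropped.
tokenizer = T5Tokenizer("spiece.model", additional_special_tokens=["<special>"])
assert "<extra_id_0>" in tokenizer.additional_special_tokens
```

The test change is a straightforward typo fix: `added_tokens_encoder` lives on the tokenizer, not on the test class, so `self.added_tokens_encoder` would raise `AttributeError`.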