diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py index 3faeccd2500cc8..3bc1726876e819 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py +++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py @@ -145,6 +145,8 @@ def __init__( from_slow = kwargs.pop("from_slow", None) from_slow = from_slow or str(pad_token) != "" or str(eos_token) != "" or str(unk_token) != "" + kwargs.pop("added_tokens_decoder", {}) + super().__init__( vocab_file, tokenizer_file=tokenizer_file, diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index d868dd00adddc1..b9bc0ec54b01fc 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2235,7 +2235,7 @@ def _from_pretrained( # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer # if `tokenizer_config.json` is `None` - if "Fast" not in cls.__name__ and tokenizer_file is not None: + if tokenizer_file is not None: # This is for slow so can be done before with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle: tokenizer_file_handle = json.load(tokenizer_file_handle) @@ -2247,14 +2247,14 @@ def _from_pretrained( # end legacy # Passing AddedTokens and not strings to the class to prevent it from casting the string to a different AddedToken + # convert {'__type': 'AddedToken', 'content': '', 'lstrip': False, 'normalized': True, ...} to AddedTokens + init_kwargs["added_tokens_decoder"] = added_tokens_decoder + init_kwargs = cls.convert_added_tokens(init_kwargs, save=False) for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys(): if added_tokens_map != {} and init_kwargs[key] is not None: if key != "additional_special_tokens": - init_kwargs[key] = added_tokens_map.get(init_kwargs[key], init_kwargs[key]) + init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key]) - init_kwargs["added_tokens_decoder"] = added_tokens_decoder - # convert {'__type': 'AddedToken', 'content': '', 'lstrip': False, 'normalized': True, ...} to AddedTokens - init_kwargs = cls.convert_added_tokens(init_kwargs, save=False) # Instantiate the tokenizer. try: tokenizer = cls(*init_inputs, **init_kwargs)