[Tokenizer Serialization] Fix the broken serialisation (#27099)
* nits

* nits

* actual fix

* style

* ze fix

* fix fix fix style
ArthurZucker authored Dec 13, 2023
1 parent f4db565 commit 230ac35
Showing 2 changed files with 7 additions and 5 deletions.
2 changes: 2 additions & 0 deletions src/transformers/models/pegasus/tokenization_pegasus_fast.py
@@ -145,6 +145,8 @@ def __init__(
         from_slow = kwargs.pop("from_slow", None)
         from_slow = from_slow or str(pad_token) != "<pad>" or str(eos_token) != "</s>" or str(unk_token) != "<unk>"
 
+        kwargs.pop("added_tokens_decoder", {})
+
         super().__init__(
             vocab_file,
             tokenizer_file=tokenizer_file,
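
For context, a minimal round-trip sketch of the pattern this one-line pop is meant to keep working: `from_pretrained` forwards every entry of `tokenizer_config.json` as a keyword argument, including the serialized `added_tokens_decoder`, and because the Pegasus fast tokenizer may force `from_slow`, it drops that entry before delegating to `super().__init__`. The checkpoint name and local path below are illustrative, not part of the commit.

# Minimal round-trip sketch; "google/pegasus-xsum" and the local directory are illustrative.
from transformers import PegasusTokenizerFast

tok = PegasusTokenizerFast.from_pretrained("google/pegasus-xsum")
# save_pretrained writes a tokenizer_config.json that now stores added_tokens_decoder
tok.save_pretrained("./pegasus-roundtrip")
# reloading passes that entry back as a kwarg; the pop keeps this from reaching
# a constructor that does not expect it
reloaded = PegasusTokenizerFast.from_pretrained("./pegasus-roundtrip")
assert reloaded.pad_token == tok.pad_token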
10 changes: 5 additions & 5 deletions src/transformers/tokenization_utils_base.py
@@ -2235,7 +2235,7 @@ def _from_pretrained(
 
         # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
         # if `tokenizer_config.json` is `None`
-        if "Fast" not in cls.__name__ and tokenizer_file is not None:
+        if tokenizer_file is not None:
             # This is for slow so can be done before
             with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle:
                 tokenizer_file_handle = json.load(tokenizer_file_handle)
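
A hedged sketch of what this now-unconditional branch parses; the file path is illustrative, and the structure shown is the `tokenizer.json` layout used by the `tokenizers` library.

import json

# Illustrative path; _from_pretrained resolves the real location from the checkpoint.
with open("tokenizer.json", encoding="utf-8") as fh:
    tokenizer_json = json.load(fh)

# "added_tokens" is a list of entries such as
# {"id": 96103, "content": "<mask_2>", "lstrip": false, "special": true, ...}; key them by index.
added_tokens = {entry["id"]: entry["content"] for entry in tokenizer_json.get("added_tokens", [])}
print(sorted(added_tokens)[:5])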
@@ -2247,14 +2247,14 @@
             # end legacy
 
         # Passing AddedTokens and not strings to the class to prevent it from casting the string to a different AddedToken
-        # convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
-        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
-        init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
         for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
             if added_tokens_map != {} and init_kwargs[key] is not None:
                 if key != "additional_special_tokens":
-                    init_kwargs[key] = added_tokens_map.get(init_kwargs[key], init_kwargs[key])
+                    init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])
 
+        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
+        # convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
+        init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
         # Instantiate the tokenizer.
         try:
             tokenizer = cls(*init_inputs, **init_kwargs)
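
The `str(...)` in the reordered lookup is the load-bearing change in this hunk: `added_tokens_map` is keyed by the token's text, while the value read back from the config may already be an `AddedToken` object. A minimal sketch with illustrative values, not taken from the commit:

from tokenizers import AddedToken

# added_tokens_map is keyed by plain token content ...
added_tokens_map = {"</s>": AddedToken("</s>", lstrip=False, rstrip=False)}
# ... while the config can hand back an AddedToken object for the same token.
eos_token = AddedToken("</s>", lstrip=False, rstrip=False)

# str() on an AddedToken yields its content, so the string lookup resolves to the
# entry stored in the map; looking up with the raw object would miss the string key
# and the stored entry (with its lstrip/rstrip flags) would be silently dropped.
resolved = added_tokens_map.get(str(eos_token), eos_token)
print(resolved is added_tokens_map["</s>"])  # True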
