Nit-added-tokens #26538
@@ -2199,6 +2199,11 @@ def _from_pretrained(
                 if isinstance(token, AddedToken):
                     added_tokens_decoder[int(idx)] = token
+                    if str(token) in additional_special_tokens:
+                        # at this point, if the token is in `additional_special_tokens` as a str, it should be updated
+                        additional_special_tokens.remove(str(token))
+                    if token.special and token not in additional_special_tokens:
+                        additional_special_tokens.append(token)
                 else:
                     raise ValueError(
                         f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary."
                     )
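For readers following along, here is a hedged, self-contained sketch of what this hunk accomplishes: a plain string in `additional_special_tokens` that also appears in the saved `added_tokens_decoder` is swapped for the full `AddedToken` object, so flags like `special`, `lstrip`, and `rstrip` survive a save/load round trip. The token content and index below are invented for illustration.

```python
from transformers import AddedToken

# Invented reload state: the config stored the token both as a plain string
# (in `additional_special_tokens`) and as a full AddedToken with its flags
# (in `added_tokens_decoder`).
additional_special_tokens = ["<my_token>"]
added_tokens_decoder = {
    32000: AddedToken("<my_token>", lstrip=False, rstrip=False, special=True),
}

for idx, token in added_tokens_decoder.items():
    if str(token) in additional_special_tokens:
        # drop the bare string so it can be replaced by the richer object
        additional_special_tokens.remove(str(token))
    if token.special and token not in additional_special_tokens:
        additional_special_tokens.append(token)

# additional_special_tokens now holds the AddedToken, flags intact
```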
@@ -2227,7 +2232,7 @@ def _from_pretrained(
            if added_tokens_file is not None:
                with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                    added_tok_encoder = json.load(added_tokens_handle)
-               # legacy: we have to init with (rstrip=True, lstrip=True)
+               # legacy: we have to init with (rstrip=True, lstrip=True) (only if the token is new? failing test)
                strip = True if "Fast" not in cls.__name__ else False
                added_tokens_decoder = {
                    index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
                }

Reviewer comment: Might have to update this. The tests are shitty and the default is biting us.
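As a hedged sketch of this legacy branch (the file contents, token, and class name are invented), the snippet below reproduces the default the reviewer is worried about: slow tokenizers, i.e. classes without "Fast" in the name, rebuild every entry of `added_tokens.json` with `rstrip=lstrip=True`, regardless of what the token originally used.

```python
import json
from transformers import AddedToken

# Invented legacy file contents: `added_tokens.json` maps content -> index
# and carries no strip/special flags at all.
added_tok_encoder = json.loads('{"<extra_0>": 32000}')

cls_name = "LlamaTokenizer"  # a slow tokenizer; "LlamaTokenizerFast" would flip this
strip = True if "Fast" not in cls_name else False
added_tokens_decoder = {
    index: AddedToken(token, rstrip=strip, lstrip=strip)
    for token, index in added_tok_encoder.items()
}

token = added_tokens_decoder[32000]
print(token.rstrip, token.lstrip)  # True True: the legacy default under discussion
```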
@@ -2382,8 +2387,8 @@ def save_pretrained( | |
tokenizer_config = copy.deepcopy(self.init_kwargs) | ||
|
||
# TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers | ||
# target_keys = self.init_kwargs.keys() | ||
target_keys = ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"] | ||
target_keys = list(self.init_kwargs.keys()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when saving; we should overwrite the init_kwargs with the content of self. Don't know why it was not the case before |
||
target_keys += ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"] | ||
for k in target_keys: | ||
if hasattr(self, k): | ||
tokenizer_config[k] = getattr(self, k) | ||
|
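The effect of the change is easiest to see with a toy object. A minimal sketch follows; the class and values are invented stand-ins, not the real tokenizer API:

```python
import copy

# Stand-in for a tokenizer: every key from `init_kwargs` is now refreshed from
# the live attribute, so values mutated after __init__ are what get saved.
class FakeTokenizer:
    def __init__(self, model_max_length=512):
        self.init_kwargs = {"model_max_length": model_max_length}
        self.model_max_length = model_max_length

tok = FakeTokenizer()
tok.model_max_length = 1024  # mutated after init

tokenizer_config = copy.deepcopy(tok.init_kwargs)
target_keys = list(tok.init_kwargs.keys())
target_keys += ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"]
for k in target_keys:
    if hasattr(tok, k):
        tokenizer_config[k] = getattr(tok, k)

print(tokenizer_config)  # {'model_max_length': 1024}; the attribute wins over the stale init kwarg
```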
Reviewer comment: Only use the default legacy values for `AddedToken` if the token is not already in the `added_tokens_decoder`.
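A hedged sketch of what that guard could look like (variable names follow the hunks above; the condition itself is the reviewer's suggestion, not code from this PR):

```python
from transformers import AddedToken

# Invented state: one token was already reloaded with explicit flags via
# `added_tokens_decoder`; another exists only in the legacy JSON mapping.
added_tokens_decoder = {32000: AddedToken("<tok_a>", rstrip=False, lstrip=False)}
added_tok_encoder = {"<tok_a>": 32000, "<tok_b>": 32001}

strip = True  # legacy default for slow tokenizers
for token, index in added_tok_encoder.items():
    # only fall back to the legacy (rstrip=True, lstrip=True) defaults for
    # genuinely new tokens; keep explicit flags when they were saved
    if index not in added_tokens_decoder:
        added_tokens_decoder[index] = AddedToken(token, rstrip=strip, lstrip=strip)
```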