From 32be32328cb854e2587ec62ea92ae8116918e7db Mon Sep 17 00:00:00 2001
From: Arthur
Date: Thu, 28 Sep 2023 12:07:49 +0200
Subject: [PATCH 01/12] fix stripping

---
 src/transformers/tokenization_utils_base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 40dc51b80d2c73..f51c3e9fd2de07 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2233,8 +2233,9 @@ def _from_pretrained(
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
             # legacy: we have to init with (rstrip=True, lstrip=True)
+            strip = True if not "Fast" in cls.__name__ else False
             added_tokens_decoder = {
-                index: AddedToken(token, rstrip=True, lstrip=True) for token, index in added_tok_encoder.items()
+                index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
             }
             # end legacy

From cb4e48a36d23d0ce15dd9e81b299b78711044320 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 14:40:07 +0200
Subject: [PATCH 02/12] nits

---
 src/transformers/tokenization_utils_base.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index f51c3e9fd2de07..35e02db9df5134 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2199,6 +2199,11 @@ def _from_pretrained(

                 if isinstance(token, AddedToken):
                     added_tokens_decoder[int(idx)] = token
+                    if str(token) in additional_special_tokens:
+                        # at this point if the token is in `additional_special_tokens` as an str, should be updated
+                        additional_special_tokens.remove(str(token))
+                    if token.special and token not in additional_special_tokens:
+                        additional_special_tokens.append(token)
                 else:
                     raise ValueError(
                         f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary."
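
A minimal standalone sketch of the legacy `added_tokens.json` path that PATCH 01 changes (illustrative only; the helper name and the `cls_name` argument are hypothetical stand-ins for the surrounding `_from_pretrained` logic):

import json

from tokenizers import AddedToken


def load_legacy_added_tokens(added_tokens_file, cls_name):
    # Rebuild an `added_tokens_decoder` from a legacy `added_tokens.json` ({token: index}).
    # Before PATCH 01 every token was created with rstrip=True/lstrip=True; after it,
    # only slow tokenizers (class name without "Fast") keep that stripping behaviour.
    with open(added_tokens_file, encoding="utf-8") as handle:
        added_tok_encoder = json.load(handle)
    strip = "Fast" not in cls_name
    return {
        index: AddedToken(token, rstrip=strip, lstrip=strip)
        for token, index in added_tok_encoder.items()
    }
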
From e9bc0e625c08e0cc55bd39da0f96cb6abefebe1c Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 14:48:25 +0200
Subject: [PATCH 03/12] fix another test

---
 src/transformers/tokenization_utils_base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 35e02db9df5134..9519cf1f025d86 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2402,8 +2402,8 @@ def save_pretrained(
         tokenizer_config = copy.deepcopy(self.init_kwargs)

         # TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
-        # target_keys = self.init_kwargs.keys()
-        target_keys = ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"]
+        target_keys = list(self.init_kwargs.keys())
+        target_keys += ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"]
         for k in target_keys:
             if hasattr(self, k):
                 tokenizer_config[k] = getattr(self, k)

From fa93ed3e2d2bbc196ec53f2044a15f12bbc421d1 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 14:49:12 +0200
Subject: [PATCH 04/12] styling

---
 src/transformers/tokenization_utils_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 9519cf1f025d86..138a22da87571e 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2238,7 +2238,7 @@ def _from_pretrained(
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
             # legacy: we have to init with (rstrip=True, lstrip=True)
-            strip = True if not "Fast" in cls.__name__ else False
+            strip = True if "Fast" not in cls.__name__ else False
             added_tokens_decoder = {
                 index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
             }

From f031f5ef751d0d3114de34d3008f08010d7362a3 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 15:40:15 +0200
Subject: [PATCH 05/12] fix?

---
 src/transformers/tokenization_utils_base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 138a22da87571e..8c35de788502a9 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2237,8 +2237,8 @@ def _from_pretrained(
         if added_tokens_file is not None:
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
-            # legacy: we have to init with (rstrip=True, lstrip=True)
-            strip = True if "Fast" not in cls.__name__ else False
+            # legacy: we have to init with (rstrip=True, lstrip=True) (if the token is new? Failing test)
+            strip = True  # if "Fast" not in cls.__name__ and token not in additional_special_tokens else False
             added_tokens_decoder = {
                 index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
             }

From fb80bf933a5ab5a06af41bd8920caa4e517d5f03 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 17:01:18 +0200
Subject: [PATCH 06/12] update

---
 src/transformers/tokenization_utils_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 020a517e128e86..3a586b8becb09e 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2238,7 +2238,7 @@ def _from_pretrained(
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
             # legacy: we have to init with (rstrip=True, lstrip=True) (if the token is new? Failing test)
-            strip = True  # if "Fast" not in cls.__name__ and token not in additional_special_tokens else False
+            strip = True if "Fast" not in cls.__name__ else False
             added_tokens_decoder = {
                 index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
             }

From a9b8845df640916ee4ee6c2bf4acbec4fa36ccb5 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 17:48:11 +0200
Subject: [PATCH 07/12] revert bad merge

---
 src/transformers/tokenization_utils_base.py | 28 ++++++---------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 3a586b8becb09e..01fadbcacf765c 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2209,11 +2209,6 @@ def _from_pretrained(
                         f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary."
                     )
         else:
-            logger.warning_once(
-                "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, "
-                " it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again."
-                " You will see the new `added_tokens_decoder` attribute that will store the relevant information."
-            )
            # begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
            if special_tokens_map_file is not None:
                with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
@@ -2237,11 +2232,14 @@ def _from_pretrained(
         if added_tokens_file is not None:
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
-            # legacy: we have to init with (rstrip=True, lstrip=True) (if the token is new? Failing test)
-            strip = True if "Fast" not in cls.__name__ else False
-            added_tokens_decoder = {
-                index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
-            }
+            # legacy: we have to init with (rstrip=True, lstrip=True)
+            rstrip = lstrip = True if "Fast" not in cls.__name__ else False
+            for token, index in added_tok_encoder.items():
+                if index in added_tokens_decoder and "Fast" not in cls.__name__:
+                    continue
+                added_tokens_decoder = {
+                    index: AddedToken(token, rstrip=rstrip, lstrip=lstrip) for token, index in added_tok_encoder.items()
+                }
             # end legacy

             # slow -> fast, non-legacy: we need to make sure the `added_tokens_decoder` is used to add tokens if the `fast` was not properly saved!
@@ -2282,16 +2280,6 @@ def _from_pretrained(
         # uses the information stored in `added_tokens_decoder`. Checks after addition that we have the same ids
         if init_kwargs.get("slow_to_fast", False):
             tokenizer.add_tokens([token for _, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])])
-            warnings = ""
-            for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0]):
-                if tokenizer.convert_tokens_to_ids(str(token)) != index:
-                    warnings += f"\texpected id: {tokenizer.convert_tokens_to_ids(str(token))}, found: {index}, token: `{token}`,\n"
-            if len(warnings) > 1:
-                logger.warn(
-                    f"You are converting a {slow_tokenizer.__class__.__name__} to a {cls.__name__}, but"
-                    f" wrong indexes were founds when adding the `added_tokens` from the `slow` tokenizer to the `fast`. "
-                    f" The following tokens had unexpected id :\n{warnings}. You should try using `from_slow`."
-                )

         # finally we add all the special_tokens to make sure eveything is initialized
         tokenizer.add_tokens(tokenizer.all_special_tokens_extended, special_tokens=True)

From 339ce67c7e44c151d2dd7f6c542cf7e255197114 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 19:34:49 +0200
Subject: [PATCH 08/12] found the bug

---
 src/transformers/tokenization_utils_base.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 01fadbcacf765c..a569f03e9f9476 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -851,6 +851,8 @@ def __init__(self, verbose=True, **kwargs):
                 continue
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 if key == "additional_special_tokens":
+                    # TODO THIS IS NASTY! Will always reset tokens to default rstrip and lstrip because self.set_attr on strings
+                    # will not check the addedtokens decoder. WILL FIX TOMORROW
                     assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
                     assert all(
                         isinstance(t, (str, AddedToken)) for t in value
@@ -2196,11 +2198,10 @@ def _from_pretrained(
             for idx, token in init_kwargs["added_tokens_decoder"].items():
                 if isinstance(token, dict):
                     token = AddedToken(**token)
-
                 if isinstance(token, AddedToken):
                     added_tokens_decoder[int(idx)] = token
                     if str(token) in additional_special_tokens:
-                        # at this point if the token is in `additional_special_tokens` as an str, should be updated
+                        # at this point the token is in `additional_special_tokens` as an str, let's add the AddedToken info
                         additional_special_tokens.remove(str(token))
                     if token.special and token not in additional_special_tokens:
                         additional_special_tokens.append(token)
@@ -2235,11 +2236,9 @@ def _from_pretrained(
             # legacy: we have to init with (rstrip=True, lstrip=True)
             rstrip = lstrip = True if "Fast" not in cls.__name__ else False
             for token, index in added_tok_encoder.items():
-                if index in added_tokens_decoder and "Fast" not in cls.__name__:
-                    continue
-                added_tokens_decoder = {
-                    index: AddedToken(token, rstrip=rstrip, lstrip=lstrip) for token, index in added_tok_encoder.items()
-                }
+                if index not in added_tokens_decoder:
+                    rstrip = lstrip = False
+                added_tokens_decoder = {index: AddedToken(token, rstrip=rstrip, lstrip=lstrip)}
             # end legacy

From d093b5cfc866aedf5919376fe2601bdfa41380e5 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Tue, 3 Oct 2023 11:34:10 +0200
Subject: [PATCH 09/12] YES SIR

---
 src/transformers/tokenization_utils.py      |  8 ++++++++
 src/transformers/tokenization_utils_base.py | 17 +++++++++--------
 src/transformers/tokenization_utils_fast.py |  8 ++++++++
 3 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index fa2902cfc25126..e68633ef139125 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -377,6 +377,14 @@ def added_tokens_decoder(self) -> Dict[int, AddedToken]:
         """
         return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0]))

+    @property
+    def added_tokens_encoder(self) -> Dict[str, int]:
+        """
+        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
+        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
+        """
+        return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}
+
     @added_tokens_decoder.setter
     def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]:
         # Always raise an error if string because users should define the behavior
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index a569f03e9f9476..92ad0a33594a83 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -846,6 +846,7 @@ def __init__(self, verbose=True, **kwargs):
         # We directly set the hidden value to allow initialization with special tokens
         # which are not yet in the vocabulary. Necessary for serialization/de-serialization
         # TODO clean this up at some point (probably by switching to fast tokenizers)
+
         for key, value in kwargs.items():
             if value is None:
                 continue
@@ -857,6 +858,14 @@ def __init__(self, verbose=True, **kwargs):
                     assert all(
                         isinstance(t, (str, AddedToken)) for t in value
                     ), "One of the tokens is not a string or an AddedToken"
+                    if hasattr(self, "added_tokens_encoder"):
+                        extended_token = []
+                        for token in value:
+                            if isinstance(token, str) and str(token) in self.added_tokens_encoder:
+                                extended_token.append(self.added_tokens_decoder[self.added_tokens_encoder[str(token)]])
+                            else:
+                                extended_token.append(token)
+                        value = extended_token
                     setattr(self, key, value)
                 elif isinstance(value, (str)):
                     value = AddedToken(value, normalized=False, special=True)
@@ -1676,14 +1685,6 @@ def _set_processor_class(self, processor_class: str):
         """Sets processor class as an attribute."""
         self._processor_class = processor_class

-    @property
-    def added_tokens_encoder(self) -> Dict[str, int]:
-        """
-        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
-        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
-        """
-        return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}
-
     @property
     def added_tokens_decoder(self) -> Dict[int, AddedToken]:
         raise NotImplementedError()
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 45a6639e1caab8..aadab8262849c7 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -185,6 +185,14 @@ def get_vocab(self) -> Dict[str, int]:
     def vocab(self) -> Dict[str, int]:
         return self.get_vocab()

+    @property
+    def added_tokens_encoder(self) -> Dict[str, int]:
+        """
+        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
+        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
+        """
+        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}
+
     @property
     def added_tokens_decoder(self) -> Dict[int, AddedToken]:
         """

From c12a2f9071929dbc113d1ea44bbae60064d8cb48 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Tue, 3 Oct 2023 11:35:32 +0200
Subject: [PATCH 10/12] is that change really required?

---
 src/transformers/tokenization_utils_base.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 92ad0a33594a83..1f2cf6e436f3da 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2235,11 +2235,10 @@ def _from_pretrained(
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
             # legacy: we have to init with (rstrip=True, lstrip=True)
-            rstrip = lstrip = True if "Fast" not in cls.__name__ else False
-            for token, index in added_tok_encoder.items():
-                if index not in added_tokens_decoder:
-                    rstrip = lstrip = False
-                added_tokens_decoder = {index: AddedToken(token, rstrip=rstrip, lstrip=lstrip)}
+            strip = True if "Fast" not in cls.__name__ else False
+            added_tokens_decoder = {
+                index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
+            }
             # end legacy
@@ -2389,7 +2388,6 @@ def save_pretrained(
         tokenizer_config = copy.deepcopy(self.init_kwargs)

-        # TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
         target_keys = list(self.init_kwargs.keys())
         target_keys += ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"]
         for k in target_keys:
             if hasattr(self, k):
                 tokenizer_config[k] = getattr(self, k)

From 02922e1bcc8a82b63f35cf128dc47721d0a23389 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Tue, 3 Oct 2023 12:02:28 +0200
Subject: [PATCH 11/12] make fast even faster

---
 src/transformers/tokenization_utils_fast.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index aadab8262849c7..2c6b3c167fecd4 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -210,10 +210,7 @@ def get_added_vocab(self) -> Dict[str, int]:
         Returns:
             `Dict[str, int]`: The added tokens.
         """
-        base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
-        full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
-        added_vocab = {tok: index for tok, index in full_vocab.items() if tok not in base_vocab}
-        return added_vocab
+        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

     def __len__(self) -> int:
         """

From 93152be29a99c47eb5dee02e770ccc42f4f6d2c1 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Tue, 3 Oct 2023 12:09:20 +0200
Subject: [PATCH 12/12] re order functions

---
 src/transformers/tokenization_utils.py | 34 +++++++++++++-------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index e68633ef139125..2ceed1b46d4899 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -368,14 +368,15 @@ def __init__(self, **kwargs):
         self._decode_use_source_tokenizer = False

     @property
-    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
-        """
-        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.
+    def is_fast(self) -> bool:
+        return False

-        Returns:
-            `Dict[str, int]`: The added tokens.
+ @property + def vocab_size(self) -> int: """ - return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])) + `int`: Size of the base vocabulary (without the added tokens). + """ + raise NotImplementedError @property def added_tokens_encoder(self) -> Dict[str, int]: @@ -385,6 +386,16 @@ def added_tokens_encoder(self) -> Dict[str, int]: """ return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])} + @property + def added_tokens_decoder(self) -> Dict[int, AddedToken]: + """ + Returns the added tokens in the vocabulary as a dictionary of index to AddedToken. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])) + @added_tokens_decoder.setter def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]: # Always raise an error if string because users should define the behavior @@ -397,17 +408,6 @@ def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token self._added_tokens_encoder[str(token)] = index - @property - def is_fast(self) -> bool: - return False - - @property - def vocab_size(self) -> int: - """ - `int`: Size of the base vocabulary (without the added tokens). - """ - raise NotImplementedError - def get_added_vocab(self) -> Dict[str, int]: """ Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from