From 32be32328cb854e2587ec62ea92ae8116918e7db Mon Sep 17 00:00:00 2001
From: Arthur
Date: Thu, 28 Sep 2023 12:07:49 +0200
Subject: [PATCH 01/12] fix stripping

---
 src/transformers/tokenization_utils_base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 40dc51b80d2c73..f51c3e9fd2de07 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2233,8 +2233,9 @@ def _from_pretrained(
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
             # legacy: we have to init with (rstrip=True, lstrip=True)
+            strip = True if not "Fast" in cls.__name__ else False
             added_tokens_decoder = {
-                index: AddedToken(token, rstrip=True, lstrip=True) for token, index in added_tok_encoder.items()
+                index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
             }
             # end legacy

From cb4e48a36d23d0ce15dd9e81b299b78711044320 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 14:40:07 +0200
Subject: [PATCH 02/12] nits

---
 src/transformers/tokenization_utils_base.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index f51c3e9fd2de07..35e02db9df5134 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2199,6 +2199,11 @@ def _from_pretrained(

                 if isinstance(token, AddedToken):
                     added_tokens_decoder[int(idx)] = token
+                    if str(token) in additional_special_tokens:
+                        # at this point if the token is in `additional_special_tokens` as an str, should be updated
+                        additional_special_tokens.remove(str(token))
+                    if token.special and token not in additional_special_tokens:
+                        additional_special_tokens.append(token)
                 else:
                     raise ValueError(
                         f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary."
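
A minimal standalone sketch of the legacy `added_tokens.json` path that PATCH 01 changes (illustrative only; the helper name and the `cls_name` argument are hypothetical stand-ins for the surrounding `_from_pretrained` logic):

import json

from tokenizers import AddedToken


def load_legacy_added_tokens(added_tokens_file, cls_name):
    # Rebuild an `added_tokens_decoder` from a legacy `added_tokens.json` ({token: index}).
    # Before PATCH 01 every token was created with rstrip=True/lstrip=True; after it,
    # only slow tokenizers (class name without "Fast") keep that stripping behaviour.
    with open(added_tokens_file, encoding="utf-8") as handle:
        added_tok_encoder = json.load(handle)
    strip = "Fast" not in cls_name
    return {
        index: AddedToken(token, rstrip=strip, lstrip=strip)
        for token, index in added_tok_encoder.items()
    }
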
From e9bc0e625c08e0cc55bd39da0f96cb6abefebe1c Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 14:48:25 +0200
Subject: [PATCH 03/12] fix another test

---
 src/transformers/tokenization_utils_base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 35e02db9df5134..9519cf1f025d86 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2402,8 +2402,8 @@ def save_pretrained(
         tokenizer_config = copy.deepcopy(self.init_kwargs)

         # TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
-        # target_keys = self.init_kwargs.keys()
-        target_keys = ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"]
+        target_keys = list(self.init_kwargs.keys())
+        target_keys += ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"]
         for k in target_keys:
             if hasattr(self, k):
                 tokenizer_config[k] = getattr(self, k)

From fa93ed3e2d2bbc196ec53f2044a15f12bbc421d1 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 14:49:12 +0200
Subject: [PATCH 04/12] styling

---
 src/transformers/tokenization_utils_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 9519cf1f025d86..138a22da87571e 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2238,7 +2238,7 @@ def _from_pretrained(
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
             # legacy: we have to init with (rstrip=True, lstrip=True)
-            strip = True if not "Fast" in cls.__name__ else False
+            strip = True if "Fast" not in cls.__name__ else False
             added_tokens_decoder = {
                 index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
             }

From f031f5ef751d0d3114de34d3008f08010d7362a3 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 15:40:15 +0200
Subject: [PATCH 05/12] fix?

---
 src/transformers/tokenization_utils_base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 138a22da87571e..8c35de788502a9 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2237,8 +2237,8 @@ def _from_pretrained(
         if added_tokens_file is not None:
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
-            # legacy: we have to init with (rstrip=True, lstrip=True)
-            strip = True if "Fast" not in cls.__name__ else False
+            # legacy: we have to init with (rstrip=True, lstrip=True) (if the token is new? Failing test)
+            strip = True  # if "Fast" not in cls.__name__ and token not in additional_special_tokens else False
             added_tokens_decoder = {
                 index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
             }

From fb80bf933a5ab5a06af41bd8920caa4e517d5f03 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 17:01:18 +0200
Subject: [PATCH 06/12] update

---
 src/transformers/tokenization_utils_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 020a517e128e86..3a586b8becb09e 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2238,7 +2238,7 @@ def _from_pretrained(
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
             # legacy: we have to init with (rstrip=True, lstrip=True) (if the token is new? Failing test)
-            strip = True  # if "Fast" not in cls.__name__ and token not in additional_special_tokens else False
+            strip = True if "Fast" not in cls.__name__ else False
             added_tokens_decoder = {
                 index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
             }

From a9b8845df640916ee4ee6c2bf4acbec4fa36ccb5 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 17:48:11 +0200
Subject: [PATCH 07/12] revert bad merge

---
 src/transformers/tokenization_utils_base.py | 28 ++++++---------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 3a586b8becb09e..01fadbcacf765c 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2209,11 +2209,6 @@ def _from_pretrained(
                         f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary."
                     )
         else:
-            logger.warning_once(
-                "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, "
-                " it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again."
-                " You will see the new `added_tokens_decoder` attribute that will store the relevant information."
-            )
            # begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
            if special_tokens_map_file is not None:
                with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
@@ -2237,11 +2232,14 @@ def _from_pretrained(
         if added_tokens_file is not None:
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
-            # legacy: we have to init with (rstrip=True, lstrip=True) (if the token is new? Failing test)
-            strip = True if "Fast" not in cls.__name__ else False
-            added_tokens_decoder = {
-                index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
-            }
+            # legacy: we have to init with (rstrip=True, lstrip=True)
+            rstrip = lstrip = True if "Fast" not in cls.__name__ else False
+            for token, index in added_tok_encoder.items():
+                if index in added_tokens_decoder and "Fast" not in cls.__name__:
+                    continue
+                added_tokens_decoder = {
+                    index: AddedToken(token, rstrip=rstrip, lstrip=lstrip) for token, index in added_tok_encoder.items()
+                }
             # end legacy

             # slow -> fast, non-legacy: we need to make sure the `added_tokens_decoder` is used to add tokens if the `fast` was not properly saved!
@@ -2282,16 +2280,6 @@ def _from_pretrained(
         # uses the information stored in `added_tokens_decoder`. Checks after addition that we have the same ids
         if init_kwargs.get("slow_to_fast", False):
             tokenizer.add_tokens([token for _, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])])
-            warnings = ""
-            for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0]):
-                if tokenizer.convert_tokens_to_ids(str(token)) != index:
-                    warnings += f"\texpected id: {tokenizer.convert_tokens_to_ids(str(token))}, found: {index}, token: `{token}`,\n"
-            if len(warnings) > 1:
-                logger.warn(
-                    f"You are converting a {slow_tokenizer.__class__.__name__} to a {cls.__name__}, but"
-                    f" wrong indexes were founds when adding the `added_tokens` from the `slow` tokenizer to the `fast`. "
-                    f" The following tokens had unexpected id :\n{warnings}. You should try using `from_slow`."
-                )

         # finally we add all the special_tokens to make sure eveything is initialized
         tokenizer.add_tokens(tokenizer.all_special_tokens_extended, special_tokens=True)

From 339ce67c7e44c151d2dd7f6c542cf7e255197114 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 2 Oct 2023 19:34:49 +0200
Subject: [PATCH 08/12] found the bug

---
 src/transformers/tokenization_utils_base.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 01fadbcacf765c..a569f03e9f9476 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -851,6 +851,8 @@ def __init__(self, verbose=True, **kwargs):
                 continue
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 if key == "additional_special_tokens":
+                    # TODO THIS IS NASTY! Will always reset tokens to default rstrip and lstrip because self.set_attr on strings
+                    # will not check the addedtokens decoder. WILL FIX TOMORROW
                     assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
                     assert all(
                         isinstance(t, (str, AddedToken)) for t in value
@@ -2196,11 +2198,10 @@ def _from_pretrained(
             for idx, token in init_kwargs["added_tokens_decoder"].items():
                 if isinstance(token, dict):
                     token = AddedToken(**token)
-
                 if isinstance(token, AddedToken):
                     added_tokens_decoder[int(idx)] = token
                     if str(token) in additional_special_tokens:
-                        # at this point if the token is in `additional_special_tokens` as an str, should be updated
+                        # at this point the token is in `additional_special_tokens` as an str, let's add the AddedToken info
                         additional_special_tokens.remove(str(token))
                     if token.special and token not in additional_special_tokens:
                         additional_special_tokens.append(token)
@@ -2235,11 +2236,9 @@ def _from_pretrained(
             # legacy: we have to init with (rstrip=True, lstrip=True)
             rstrip = lstrip = True if "Fast" not in cls.__name__ else False
             for token, index in added_tok_encoder.items():
-                if index in added_tokens_decoder and "Fast" not in cls.__name__:
-                    continue
-                added_tokens_decoder = {
-                    index: AddedToken(token, rstrip=rstrip, lstrip=lstrip) for token, index in added_tok_encoder.items()
-                }
+                if index not in added_tokens_decoder:
+                    rstrip = lstrip = False
+                added_tokens_decoder = {index: AddedToken(token, rstrip=rstrip, lstrip=lstrip)}
             # end legacy

From d093b5cfc866aedf5919376fe2601bdfa41380e5 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Tue, 3 Oct 2023 11:34:10 +0200
Subject: [PATCH 09/12] YES SIR

---
 src/transformers/tokenization_utils.py      |  8 ++++++++
 src/transformers/tokenization_utils_base.py | 17 +++++++++--------
 src/transformers/tokenization_utils_fast.py |  8 ++++++++
 3 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index fa2902cfc25126..e68633ef139125 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -377,6 +377,14 @@ def added_tokens_decoder(self) -> Dict[int, AddedToken]:
         """
         return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0]))

+    @property
+    def added_tokens_encoder(self) -> Dict[str, int]:
+        """
+        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
+        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
+        """
+        return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}
+
     @added_tokens_decoder.setter
     def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]:
         # Always raise an error if string because users should define the behavior
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index a569f03e9f9476..92ad0a33594a83 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -846,6 +846,7 @@ def __init__(self, verbose=True, **kwargs):
         # We directly set the hidden value to allow initialization with special tokens
         # which are not yet in the vocabulary. Necessary for serialization/de-serialization
         # TODO clean this up at some point (probably by switching to fast tokenizers)
+
         for key, value in kwargs.items():
             if value is None:
                 continue
@@ -857,6 +858,14 @@ def __init__(self, verbose=True, **kwargs):
                     assert all(
                         isinstance(t, (str, AddedToken)) for t in value
                     ), "One of the tokens is not a string or an AddedToken"
+                    if hasattr(self, "added_tokens_encoder"):
+                        extended_token = []
+                        for token in value:
+                            if isinstance(token, str) and str(token) in self.added_tokens_encoder:
+                                extended_token.append(self.added_tokens_decoder[self.added_tokens_encoder[str(token)]])
+                            else:
+                                extended_token.append(token)
+                        value = extended_token
                     setattr(self, key, value)
                 elif isinstance(value, (str)):
                     value = AddedToken(value, normalized=False, special=True)
@@ -1676,14 +1685,6 @@ def _set_processor_class(self, processor_class: str):
         """Sets processor class as an attribute."""
         self._processor_class = processor_class

-    @property
-    def added_tokens_encoder(self) -> Dict[str, int]:
-        """
-        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
-        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
-        """
-        return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}
-
     @property
     def added_tokens_decoder(self) -> Dict[int, AddedToken]:
         raise NotImplementedError()
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 45a6639e1caab8..aadab8262849c7 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -185,6 +185,14 @@ def get_vocab(self) -> Dict[str, int]:
     def vocab(self) -> Dict[str, int]:
         return self.get_vocab()

+    @property
+    def added_tokens_encoder(self) -> Dict[str, int]:
+        """
+        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
+        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
+        """
+        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}
+
     @property
     def added_tokens_decoder(self) -> Dict[int, AddedToken]:
         """

From c12a2f9071929dbc113d1ea44bbae60064d8cb48 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Tue, 3 Oct 2023 11:35:32 +0200
Subject: [PATCH 10/12] is that change really required?

---
 src/transformers/tokenization_utils_base.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 92ad0a33594a83..1f2cf6e436f3da 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2235,11 +2235,10 @@ def _from_pretrained(
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                 added_tok_encoder = json.load(added_tokens_handle)
             # legacy: we have to init with (rstrip=True, lstrip=True)
-            rstrip = lstrip = True if "Fast" not in cls.__name__ else False
-            for token, index in added_tok_encoder.items():
-                if index not in added_tokens_decoder:
-                    rstrip = lstrip = False
-                added_tokens_decoder = {index: AddedToken(token, rstrip=rstrip, lstrip=lstrip)}
+            strip = True if "Fast" not in cls.__name__ else False
+            added_tokens_decoder = {
+                index: AddedToken(token, rstrip=strip, lstrip=strip) for token, index in added_tok_encoder.items()
+            }
             # end legacy
@@ -2389,7 +2388,6 @@ def save_pretrained(
         tokenizer_config = copy.deepcopy(self.init_kwargs)

-        # TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
         target_keys = list(self.init_kwargs.keys())
         target_keys += ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"]
         for k in target_keys:
             if hasattr(self, k):
                 tokenizer_config[k] = getattr(self, k)

From 02922e1bcc8a82b63f35cf128dc47721d0a23389 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Tue, 3 Oct 2023 12:02:28 +0200
Subject: [PATCH 11/12] make fast even faster

---
 src/transformers/tokenization_utils_fast.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index aadab8262849c7..2c6b3c167fecd4 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -210,10 +210,7 @@ def get_added_vocab(self) -> Dict[str, int]:
         Returns:
             `Dict[str, int]`: The added tokens.
         """
-        base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
-        full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
-        added_vocab = {tok: index for tok, index in full_vocab.items() if tok not in base_vocab}
-        return added_vocab
+        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

     def __len__(self) -> int:
         """

From 93152be29a99c47eb5dee02e770ccc42f4f6d2c1 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Tue, 3 Oct 2023 12:09:20 +0200
Subject: [PATCH 12/12] re order functions

---
 src/transformers/tokenization_utils.py | 34 +++++++++++++-------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index e68633ef139125..2ceed1b46d4899 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -368,14 +368,15 @@ def __init__(self, **kwargs):
         self._decode_use_source_tokenizer = False

     @property
-    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
-        """
-        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.
+    def is_fast(self) -> bool:
+        return False

-        Returns:
-            `Dict[str, int]`: The added tokens.
+ @property + def vocab_size(self) -> int: """ - return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])) + `int`: Size of the base vocabulary (without the added tokens). + """ + raise NotImplementedError @property def added_tokens_encoder(self) -> Dict[str, int]: @@ -385,6 +386,16 @@ def added_tokens_encoder(self) -> Dict[str, int]: """ return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])} + @property + def added_tokens_decoder(self) -> Dict[int, AddedToken]: + """ + Returns the added tokens in the vocabulary as a dictionary of index to AddedToken. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])) + @added_tokens_decoder.setter def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]: # Always raise an error if string because users should define the behavior @@ -397,17 +408,6 @@ def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token self._added_tokens_encoder[str(token)] = index - @property - def is_fast(self) -> bool: - return False - - @property - def vocab_size(self) -> int: - """ - `int`: Size of the base vocabulary (without the added tokens). - """ - raise NotImplementedError - def get_added_vocab(self) -> Dict[str, int]: """ Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from