[Tokenizer] Fix slow and fast serialization (#26570)
ArthurZucker committed Oct 18, 2023
1 parent 75c4250 commit 0c4b637
Showing 49 changed files with 508 additions and 238 deletions.
1 change: 1 addition & 0 deletions .circleci/create_circleci_config.py
@@ -127,6 +127,7 @@ def to_dict(self):
},
]
steps.extend([{"run": l} for l in self.install_steps])
steps.extend([{"run": "pip install pytest-subtests"}])
steps.append(
{
"save_cache": {
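
The new `pytest-subtests` test dependency lets a single tokenizer test report each checked case as its own sub-result instead of stopping at the first failure. A minimal sketch of the fixture it provides (hypothetical test, not part of this commit):

def test_special_tokens_roundtrip(subtests):
    # pytest-subtests exposes the `subtests` fixture; each tokenizer below gets
    # its own pass/fail entry (names are illustrative only).
    for name in ["bart", "camembert", "llama"]:
        with subtests.test(tokenizer=name):
            assert isinstance(name, str)  # placeholder check
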
6 changes: 3 additions & 3 deletions src/transformers/convert_slow_tokenizer.py
@@ -1168,9 +1168,9 @@ def tokenizer(self, proto):
)
tokenizer.add_special_tokens(
[
AddedToken("<unk>"),
AddedToken("<s>"),
AddedToken("</s>"),
AddedToken("<unk>", normalized=False, special=True),
AddedToken("<s>", normalized=False, special=True),
AddedToken("</s>", normalized=False, special=True),
]
)
else:
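
An `AddedToken` carries its matching flags with it, so declaring `normalized=False, special=True` at conversion time is what ends up serialized for the fast tokenizer. A small sketch of the flags involved (illustrative, and assuming a `tokenizers` version recent enough to accept the `special` argument):

from tokenizers import AddedToken

unk = AddedToken("<unk>", normalized=False, special=True)
print(str(unk))        # "<unk>", the raw content
print(unk.normalized)  # False: matched against the un-normalized input
print(unk.special)     # True: re-added as a special token on load
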
2 changes: 0 additions & 2 deletions src/transformers/models/bart/tokenization_bart.py
@@ -204,8 +204,6 @@ def __init__(
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

# Mask token behave like a normal word, i.e. include the space before it
# TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. See `test_embeded_special_tokens`
# Also this not only will strip the spaces but any punctuation
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

with open(vocab_file, encoding="utf-8") as vocab_handle:
7 changes: 6 additions & 1 deletion src/transformers/models/bart/tokenization_bart_fast.py
@@ -170,7 +170,12 @@ def __init__(
trim_offsets=True,
**kwargs,
):
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# we have to specify that this tokens is special otherwise adding it will reset the normalized flag to `False` in `add_special_tokens`
mask_token = (
AddedToken(mask_token, lstrip=True, normalized=True, special=True)
if isinstance(mask_token, str)
else mask_token
)
super().__init__(
vocab_file,
merges_file,
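
The `lstrip=True, normalized=True` combination keeps BART's historical behaviour: the mask token absorbs the space in front of it, and marking it special stops `add_special_tokens` from resetting `normalized` to `False`. A quick sanity check one could run (assumes the `facebook/bart-base` checkpoint is reachable):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/bart-base")
# With lstrip=True the space before <mask> is swallowed by the token itself,
# so both variants should map to the same ids.
print(tok("Hello <mask>")["input_ids"])
print(tok("Hello<mask>")["input_ids"])
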
4 changes: 2 additions & 2 deletions src/transformers/models/barthez/tokenization_barthez.py
@@ -138,8 +138,8 @@ def __init__(
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# Mask token behave like a normal word, i.e. include the space before it. Will have normalized=False by default this way
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

8 changes: 4 additions & 4 deletions src/transformers/models/bertweet/tokenization_bertweet.py
@@ -149,10 +149,10 @@ def __init__(
self.merges_file = merges_file

self.encoder = {}
self.encoder[bos_token] = 0
self.encoder[pad_token] = 1
self.encoder[eos_token] = 2
self.encoder[unk_token] = 3
self.encoder[str(bos_token)] = 0
self.encoder[str(pad_token)] = 1
self.encoder[str(eos_token)] = 2
self.encoder[str(unk_token)] = 3

self.add_from_file(vocab_file)

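
Wrapping the tokens in `str()` matters because the defaults may arrive as `AddedToken` objects rather than plain strings; keying the encoder on their string content keeps later lookups working. A minimal illustration (not from the commit):

from tokenizers import AddedToken

bos_token = AddedToken("<s>", special=True)  # may be an object rather than a str
encoder = {}
encoder[str(bos_token)] = 0                  # key on the string content
print("<s>" in encoder)                      # True, whichever type was passed in
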
20 changes: 12 additions & 8 deletions src/transformers/models/camembert/tokenization_camembert.py
@@ -89,7 +89,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
additional_special_tokens (`List[str]`, *optional*, defaults to `['<s>NOTUSED', '</s>NOTUSED', '<unk>NOTUSED']`):
Additional special tokens used by the tokenizer.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
@@ -127,12 +127,16 @@ def __init__(
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False, special=True)
if isinstance(mask_token, str)
else mask_token
)

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

@@ -144,11 +148,11 @@ def __init__(
# sentencepiece vocabulary (this is the case for <s> and </s> and <unk>).
# In this case it is recommended to properly set the tokens by hand.
self._added_tokens_decoder = {
0: AddedToken("<s>NOTUSED"),
1: AddedToken(pad_token),
2: AddedToken("</s>NOTUSED"),
3: AddedToken(unk_token),
4: AddedToken("<unk>NOTUSED"),
0: AddedToken("<s>NOTUSED", special=True),
1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token,
2: AddedToken("</s>NOTUSED", special=True),
3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token,
4: AddedToken("<unk>NOTUSED", special=True),
}

self.fairseq_offset = 4 # 3 tokens are newly added, but the offset starts from 4
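
Pre-seeding `_added_tokens_decoder` covers ids that do not exist in the underlying sentencepiece model; every sentencepiece id is then shifted by `fairseq_offset`. A simplified sketch of that mapping (the piece id below is hypothetical):

# Ids 0-3 are reserved for tokens missing from the sentencepiece vocabulary,
# plus the legacy "<unk>NOTUSED" slot at 4; real sentencepiece ids are shifted
# by the offset when converted to model ids.
added_tokens_decoder = {
    0: "<s>NOTUSED",
    1: "<pad>",
    2: "</s>NOTUSED",
    3: "<unk>",
    4: "<unk>NOTUSED",
}
fairseq_offset = 4
sp_id = 10                       # hypothetical sentencepiece piece id
model_id = sp_id + fairseq_offset
print(model_id)                  # 14
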
src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -119,12 +119,11 @@ def __init__(
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],
**kwargs,
):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

# Mask token behave like a normal word, i.e. include the space before it. Will have normalized = False
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
8 changes: 4 additions & 4 deletions src/transformers/models/codegen/tokenization_codegen.py
@@ -163,10 +163,10 @@ def __init__(
add_bos_token=False,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
self.add_bos_token = add_bos_token

with open(vocab_file, encoding="utf-8") as vocab_handle:
12 changes: 6 additions & 6 deletions src/transformers/models/deberta/tokenization_deberta.py
@@ -192,12 +192,12 @@ def __init__(
add_bos_token=False,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token

# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
src/transformers/models/deberta_v2/tokenization_deberta_v2.py
@@ -138,7 +138,7 @@ def __init__(
self._tokenizer = SPMTokenizer(
vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
)
unk_token = AddedToken(unk_token, normalized=True, lstrip=False, rstrip=False)
unk_token = AddedToken(unk_token, normalized=True, special=True) if isinstance(unk_token, str) else unk_token
super().__init__(
do_lower_case=do_lower_case,
bos_token=bos_token,
7 changes: 4 additions & 3 deletions src/transformers/models/fnet/tokenization_fnet.py
@@ -116,9 +116,10 @@
) -> None:
# Mask token behave like a normal word, i.e. include the space before it and
# is included in the raw text, there should be a match in a non-normalized sentence.
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

self.do_lower_case = do_lower_case
@@ -20,7 +20,7 @@
import unicodedata
from typing import Dict, List, Optional, Tuple, Union

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils_base import (
BatchEncoding,
EncodedInput,
@@ -244,6 +244,12 @@ def __init__(
additional_special_tokens: Optional[List[str]] = None,
**kwargs,
):
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -250,7 +250,7 @@ def __init__(
**kwargs,
) -> None:
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

2 changes: 0 additions & 2 deletions src/transformers/models/led/tokenization_led.py
@@ -197,8 +197,6 @@ def __init__(
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

# Mask token behave like a normal word, i.e. include the space before it
# TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. See `test_embeded_special_tokens`
# Also this not only will strip the spaces but any punctuation
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

with open(vocab_file, encoding="utf-8") as vocab_handle:
7 changes: 6 additions & 1 deletion src/transformers/models/led/tokenization_led_fast.py
@@ -152,7 +152,12 @@ def __init__(
trim_offsets=True,
**kwargs,
):
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# we have to specify that this tokens is special otherwise adding it will reset the normalized flag to `False` in `add_special_tokens`
mask_token = (
AddedToken(mask_token, lstrip=True, normalized=True, special=True)
if isinstance(mask_token, str)
else mask_token
)
super().__init__(
vocab_file,
merges_file,
8 changes: 4 additions & 4 deletions src/transformers/models/llama/tokenization_llama.py
@@ -118,10 +118,10 @@ def __init__(
**kwargs,
):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token

if legacy is None:
logger.warning_once(
4 changes: 2 additions & 2 deletions src/transformers/models/marian/tokenization_marian.py
@@ -148,9 +148,9 @@ def __init__(

self.separate_vocabs = separate_vocabs
self.encoder = load_json(vocab)
if unk_token not in self.encoder:
if str(unk_token) not in self.encoder:
raise KeyError("<unk> token must be in the vocab")
assert pad_token in self.encoder
assert str(pad_token) in self.encoder

if separate_vocabs:
self.target_encoder = load_json(target_vocab_file)
4 changes: 3 additions & 1 deletion src/transformers/models/mbart/tokenization_mbart.py
@@ -97,7 +97,9 @@ def __init__(
**kwargs,
):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
mask_token = (
AddedToken(mask_token, lstrip=True, normalized=False) if isinstance(mask_token, str) else mask_token
)

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

2 changes: 1 addition & 1 deletion src/transformers/models/mbart50/tokenization_mbart50.py
@@ -132,7 +132,7 @@ def __init__(

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
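
The added `or []` guards against a caller passing `additional_special_tokens=None` explicitly: `dict.get` only falls back to its default when the key is missing, so the `+=` that follows would otherwise fail on `None`. A small illustration (not from the commit):

kwargs = {"additional_special_tokens": None}   # key present, value explicitly None
tokens = kwargs.get("additional_special_tokens", []) or []
tokens += ["ar_AR", "cs_CZ"]                   # language codes, as in the diff
print(tokens)                                  # ['ar_AR', 'cs_CZ']
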
src/transformers/models/mbart50/tokenization_mbart50_fast.py
@@ -127,7 +127,7 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
kwargs["additional_special_tokens"] += [
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
19 changes: 10 additions & 9 deletions src/transformers/models/mpnet/tokenization_mpnet.py
@@ -147,15 +147,15 @@ def __init__(
strip_accents=None,
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token

# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token

if not os.path.isfile(vocab_file):
raise ValueError(
@@ -199,8 +199,9 @@ def vocab_size(self):
return len(self.vocab)

def get_vocab(self):
vocab = self.vocab.copy()
vocab.update(self.added_tokens_encoder)
# "<mask>" is part of the vocab, but was wrongfully added at a wrong index in the fast saved version
vocab = self.added_tokens_encoder.copy()
vocab.update(self.vocab)
return vocab

def _tokenize(self, text):
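
The reordered `get_vocab` lets the base vocabulary win when the same token was also registered as an added token at a wrong index by an earlier fast-tokenizer save. A sketch of why the copy/update order matters (indices are hypothetical):

added_tokens_encoder = {"<mask>": 30530}   # stale index from a bad fast save
vocab = {"<mask>": 30526}                  # correct index from the base vocab

merged = added_tokens_encoder.copy()
merged.update(vocab)                       # the later update wins on duplicate keys
print(merged["<mask>"])                    # 30526
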