
[LlamaTokenizerFast] Refactor default llama #28881

Status: Merged (28 commits, merged Apr 23, 2024)

Changes from all commits:
689c9aa: push legacy to fast as well (ArthurZucker, Feb 6, 2024)
7a3ddf4: super strange (ArthurZucker, Feb 6, 2024)
d754f3b: Update src/transformers/convert_slow_tokenizer.py (ArthurZucker, Feb 20, 2024)
e1ef2d3: make sure we are BC (ArthurZucker, Mar 22, 2024)
0897c61: fix Llama test (ArthurZucker, Mar 22, 2024)
b188318: nit (ArthurZucker, Mar 22, 2024)
f7e8b06: revert (ArthurZucker, Mar 22, 2024)
d939358: more test (ArthurZucker, Mar 22, 2024)
5c48f71: Merge branch 'main' of github.com:huggingface/transformers into refac… (ArthurZucker, Mar 22, 2024)
ce042c6: style (ArthurZucker, Mar 22, 2024)
bbd26b0: update (ArthurZucker, Mar 25, 2024)
84d406c: small update w.r.t tokenizers (ArthurZucker, Mar 30, 2024)
038869d: Merge branch 'main' of github.com:huggingface/transformers into refac… (ArthurZucker, Apr 22, 2024)
6fbb0ac: nit (ArthurZucker, Apr 22, 2024)
6bd696e: don't split (ArthurZucker, Apr 22, 2024)
436335d: lol (ArthurZucker, Apr 23, 2024)
afa51b1: add a test for `add_prefix_space=False` (ArthurZucker, Apr 23, 2024)
312cc1d: fix gemma tokenizer as well (ArthurZucker, Apr 23, 2024)
feeec97: update (ArthurZucker, Apr 23, 2024)
e7953f5: fix gemma (ArthurZucker, Apr 23, 2024)
ca25a81: nicer failures (ArthurZucker, Apr 23, 2024)
7bfe577: fixup (ArthurZucker, Apr 23, 2024)
05cd744: update (ArthurZucker, Apr 23, 2024)
9c92741: fix the example for legacy = False (ArthurZucker, Apr 23, 2024)
187efad: use `huggyllama/llama-7b` for the PR doctest (ArthurZucker, Apr 23, 2024)
9eac9c7: nit (ArthurZucker, Apr 23, 2024)
4c40705: use from_slow (ArthurZucker, Apr 23, 2024)
e775b7d: fix llama (ArthurZucker, Apr 23, 2024)
20 changes: 12 additions & 8 deletions src/transformers/convert_slow_tokenizer.py
@@ -105,7 +105,7 @@ def extract(self, vocab_scores=None) -> Tuple[Dict[str, int], List[Tuple]]:

        # there is a missing token in the vocab. We have to do this to support merges
        # "<0x09>" is the bytefallback for `\t`
-       vocab["\t"] = vocab.pop("<0x09>")
+       vocab["\t"] = vocab.get("<0x09>")

        if vocab_scores is not None:
            vocab_scores, reverse = dict(vocab_scores), True
@@ -1276,7 +1276,7 @@ def vocab(self, proto):
        return vocab

    def pre_tokenizer(self, replacement, add_prefix_space):
-       return None
+       return pre_tokenizers.Split(" ", "merged_with_previous")

    def unk_id(self, proto):
        unk_id = 3
@@ -1329,7 +1329,7 @@ def tokenizer(self, proto):
                "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
            )
        user_defined_symbols = [
-           AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols
+           AddedToken(token, normalized=True, special=False) for token in proto.trainer_spec.user_defined_symbols
        ]
        tokenizer.add_tokens(user_defined_symbols)
        return tokenizer

ArthurZucker (Collaborator, Author) commented on the `normalized=True` change: This is a must? Unless with only the split this works.
@@ -1393,14 +1393,18 @@ def tokenizer(self, proto):
        return tokenizer

    def normalizer(self, proto):
-       sequence = []
-       if hasattr(self.original_tokenizer, "add_prefix_space"):
-           if self.original_tokenizer.add_prefix_space:
+       if self.original_tokenizer.legacy:
+           sequence = []
+           if getattr(self.original_tokenizer, "add_prefix_space"):
                sequence += [normalizers.Prepend(prepend="▁")]
-       sequence += [normalizers.Replace(pattern=" ", content="▁")]
-       return normalizers.Sequence(sequence)
+           sequence += [normalizers.Replace(pattern=" ", content="▁")]
+           return normalizers.Sequence(sequence)
+       return None  # non-legacy, no normalizer

    def pre_tokenizer(self, replacement, add_prefix_space):
+       if not self.original_tokenizer.legacy:  # non-legacy, we need a replace
+           prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
+           return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme, split=False)
        return None

    def post_processor(self):
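To make the converter change concrete, here is a minimal sketch, separate from the diff, of what the two paths build, using the `tokenizers` library directly. It assumes a recent `tokenizers` release in which `Metaspace` accepts `prepend_scheme` and `split`; the `prepend_scheme="first"` value is an illustrative choice, since the real converter derives it through `_get_prepend_scheme`.

```python
from tokenizers import normalizers, pre_tokenizers

# Legacy path: the normalizer handles the prefix space and the space
# replacement, and there is no Metaspace pre-tokenizer.
legacy_norm = normalizers.Sequence(
    [normalizers.Prepend(prepend="▁"), normalizers.Replace(pattern=" ", content="▁")]
)
print(legacy_norm.normalize_str("Hey friend"))  # '▁Hey▁friend'

# Non-legacy path: no normalizer; a Metaspace pre-tokenizer with split=False,
# so spaces become '▁' but the text is not cut into pieces at this stage.
meta = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="first", split=False)
print(meta.pre_tokenize_str("Hey friend"))  # a single piece: '▁Hey▁friend'
```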
24 changes: 12 additions & 12 deletions src/transformers/models/llama/tokenization_llama.py
@@ -99,30 +99,30 @@ class LlamaTokenizer(PreTrainedTokenizer):
            Whether or not to add spaces between special tokens.
        legacy (`bool`, *optional*):
            Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
-           and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
-           example:
+           and #25224 which includes fixes to properly handle tokens that appear after special tokens.
+           Make sure to also set `from_slow` to `True`.
+           A simple example:

            - `legacy=True`:
            ```python
-           >>> from transformers import T5Tokenizer
+           >>> from transformers import LlamaTokenizerFast

-           >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
-           >>> tokenizer.encode("Hello <extra_id_0>.")
-           [8774, 32099, 3, 5, 1]
+           >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=True, from_slow=True)
+           >>> tokenizer.encode("Hello <s>.")  # 869 is '▁.'
+           [1, 15043, 29871, 1, 869]
            ```
            - `legacy=False`:
            ```python
-           >>> from transformers import T5Tokenizer
+           >>> from transformers import LlamaTokenizerFast

-           >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
-           >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
-           [8774, 32099, 5, 1]
+           >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
+           >>> tokenizer.encode("Hello <s>.")  # 29889 is '.'
+           [1, 15043, 29871, 1, 29889]
            ```
            Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
        add_prefix_space (`bool`, *optional*, defaults to `True`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-           other word.
-
+           other word. Again, this should be set with `from_slow=True` to make sure it's taken into account.
        """

    vocab_files_names = VOCAB_FILES_NAMES
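As a companion to the docstring change above, a hedged usage sketch: both `legacy` and `add_prefix_space` only take effect when the fast tokenizer is rebuilt from the sentencepiece files, hence `from_slow=True`. The `['hello']` output is taken from `test_no_prefix_space` added later in this PR; the `['▁hello']` output for the default tokenizer is the usual sentencepiece dummy-prefix behavior rather than something asserted in this diff.

```python
from transformers import LlamaTokenizerFast

# Default: a dummy prefix space is prepended to the first word.
with_prefix = LlamaTokenizerFast.from_pretrained(
    "huggyllama/llama-7b", legacy=False, from_slow=True
)
print(with_prefix.tokenize("hello"))  # ['▁hello']

# add_prefix_space=False: the leading word is tokenized as-is.
no_prefix = LlamaTokenizerFast.from_pretrained(
    "huggyllama/llama-7b", legacy=False, from_slow=True, add_prefix_space=False
)
print(no_prefix.tokenize("hello"))  # ['hello']
```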
37 changes: 36 additions & 1 deletion src/transformers/models/llama/tokenization_llama_fast.py
@@ -91,7 +91,30 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
-           Whether or not the default system prompt for Llama should be used.
+           Whether or not the default system prompt for Llama should be used
+       legacy (`bool`, *optional*):
+           Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
+           and #25224 which includes fixes to properly handle tokens that appear after special tokens.
+           Make sure to also set `from_slow` to `True`.
+           A simple example:
+
+           - `legacy=True`:
+           ```python
+           >>> from transformers import LlamaTokenizerFast
+
+           >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=True, from_slow=True)
+           >>> tokenizer.encode("Hello <s>.")  # 869 is '▁.'
+           [1, 15043, 29871, 1, 869]
+           ```
+           - `legacy=False`:
+           ```python
+           >>> from transformers import LlamaTokenizerFast
+
+           >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
+           >>> tokenizer.encode("Hello <s>.")  # 29889 is '.'
+           [1, 15043, 29871, 1, 29889]
+           ```
+           Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
        add_prefix_space (`bool`, *optional*):
            Whether or not the tokenizer should automatically add a prefix space
        """
@@ -112,9 +135,21 @@ def __init__(
        add_bos_token=True,
        add_eos_token=False,
        use_default_system_prompt=False,
+       legacy=None,
        add_prefix_space=None,
        **kwargs,
    ):
+       if legacy is None:
+           logger.warning_once(
+               f"You are using the default legacy behaviour of the {self.__class__}. This is"
+               " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
+               " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
+               " means, and thoroughly read the reason why this was added as explained in"
+               " https://github.com/huggingface/transformers/pull/24565"
+           )
+           legacy = True
+       self.legacy = legacy
+
        if add_prefix_space is not None:
            logger.warning_once(
                "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
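A short sketch of the load-time behavior the `__init__` change introduces: leaving `legacy` unset keeps the previous behavior and logs a one-time warning, while passing it explicitly silences the warning, and the value is then readable as an attribute. This is a usage illustration, not part of the diff.

```python
from transformers import LlamaTokenizerFast

# No `legacy` passed: warns once and falls back to legacy=True.
tok_default = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")

# Explicit legacy=False (with from_slow=True so the conversion picks it up): no warning.
tok_new = LlamaTokenizerFast.from_pretrained(
    "huggyllama/llama-7b", legacy=False, from_slow=True
)
print(tok_default.legacy, tok_new.legacy)  # True False
```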
14 changes: 11 additions & 3 deletions tests/models/gemma/test_tokenization_gemma.py
@@ -30,6 +30,7 @@
    get_tests_dir,
    nested_simplify,
    require_jinja,
+   require_read_token,
    require_sentencepiece,
    require_tokenizers,
    require_torch,
@@ -136,11 +137,12 @@ def test_special_tokens_initialization(self):
        self.assertTrue(special_token_id in cr_output)

    @slow
+   @require_read_token
    def test_tokenizer_integration(self):
expected_encoding = {'input_ids': [[2, 158434, 591, 84193, 3836, 685, 6599, 31223, 235290, 140247, 578, 6599, 31223, 235290, 145139, 235290, 3491, 235275, 6572, 3311, 235290, 38197, 109959, 591, 25894, 235269, 162174, 235290, 235284, 235269, 1791, 6362, 12481, 235269, 1576, 18622, 235269, 2900, 1136, 86684, 235269, 29092, 4632, 16994, 604, 13146, 14944, 40371, 591, 19700, 235327, 235275, 578, 13146, 14944, 25511, 591, 235300, 12474, 235275, 675, 1163, 235248, 235304, 235284, 235340, 229903, 5377, 575, 235248, 235274, 235276, 235276, 235340, 17044, 578, 5271, 1061, 118345, 1865, 125247, 235269, 8745, 111226, 578, 176888, 235265], [2, 25894, 603, 6869, 577, 953, 235290, 8297, 5271, 209099, 41642, 774, 748, 78253, 2793, 731, 51506, 34346, 611, 2145, 2731, 578, 1833, 4807, 575, 832, 16630, 235265], [2, 651, 4320, 8426, 25341, 36271, 1163, 573, 27894, 5929, 235265]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
        self.tokenizer_integration_test_util(
            expected_encoding=expected_encoding,
-           model_name="hf-internal-testing/dummy-gemma",
+           model_name="google/gemma-2b",
            revision="",
            padding=False,
        )
@@ -318,7 +320,13 @@ def test_integration_test_xnli(self):
            encoded1 = pyth_tokenizer.encode(string)
            encoded2 = rust_tokenizer.encode(string)

-           self.assertEqual(encoded1, encoded2)
+           self.assertEqual(
+               encoded1,
+               encoded2,
+               msg="Hint: the following tokenization diff were obtained for slow vs fast:\n "
+               f"elements in slow: {set(pyth_tokenizer.tokenize(string))-set(rust_tokenizer.tokenize(string))} \nvs\n "
+               f"elements in fast: {set(rust_tokenizer.tokenize(string))-set(pyth_tokenizer.tokenize(string))} \n\n{string}",
+           )

            decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
            decoded2 = rust_tokenizer.decode(encoded1, skip_special_tokens=True)
@@ -332,7 +340,7 @@ def test_integration_test_xnli(self):
            encoded1 = pyth_tokenizer.encode(string)
            encoded2 = rust_tokenizer.encode(string)

-           self.assertEqual(encoded1, encoded2)
+           self.assertEqual(encoded1, encoded2, msg=f"failed on {string}")

            decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
            decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True)
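The `msg=` hint added above boils down to a set difference over the two tokenizations. A tiny self-contained sketch of that pattern, with made-up token sets purely for illustration:

```python
# Hypothetical outputs of a slow and a fast tokenizer on the same string.
slow_tokens = {"▁Hello", "▁world", "!"}
fast_tokens = {"▁Hello", "▁wor", "ld", "!"}

# On mismatch, report only the tokens unique to each side.
print("only in slow:", slow_tokens - fast_tokens)  # {'▁world'}
print("only in fast:", fast_tokens - slow_tokens)  # {'▁wor', 'ld'}
```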
78 changes: 71 additions & 7 deletions tests/models/llama/test_tokenization_llama.py
@@ -543,8 +543,15 @@ def test_integration_test_xnli(self):

    def test_special_token_special_word(self):
        # the word inform should be split as ['in', 'form']
-       tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
+       tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
        tokenizer.add_tokens([AddedToken("<REPR_END>", rstrip=True, lstrip=True)], special_tokens=False)
+
+       example_inputs = tokenizer.tokenize("<REPR_END>inform<s>. Hey. .")
+       self.assertEqual(example_inputs, ["<REPR_END>", "in", "form", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
+
+       # Make sure dummy space is added if it is indeed the first word
+       example_inputs = tokenizer.tokenize("inform<s>. Hey. .")
+       self.assertEqual(example_inputs, ["▁inform", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
        out1 = tokenizer.decode(
            tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
        )
@@ -553,12 +560,12 @@ def test_special_token_special_word(self):
            tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=True
        )
        # decoding strips the added prefix space.
-       self.assertEqual(out2, "<REPR_END> inform")
+       self.assertEqual(out2, "<REPR_END>inform")
        input_ids = tokenizer.encode("<REPR_END>inform", add_special_tokens=False)
-       self.assertEqual(input_ids, [29871, 32000, 262, 689])  # 29871 is the spiece underline, '▁' added as it should
+       self.assertEqual(input_ids, [32000, 262, 689])  # 29871 is the spiece underline, '▁' added as it should

        out2 = tokenizer.decode(
-           tokenizer.encode(" <REPR_END> inform", add_special_tokens=False), spaces_between_special_tokens=False
+           tokenizer.encode(" <REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
        )
        # TODO @ArthurZ currently we strip left and right, so this will not keep the spaces
        self.assertEqual(out2, "<REPR_END>inform")
@@ -575,11 +582,11 @@ def test_special_token_special_word(self):

        # Let's make sure that if there are any spaces, we don't remove them!
        input_ids = tokenizer.encode(" <s> Hello<s> how", add_special_tokens=False)
-       self.assertEqual(input_ids, [259, 1, 15043, 1, 920])
+       self.assertEqual(input_ids, [29871, 1, 15043, 1, 920])
        tokens = tokenizer.tokenize(" <s> Hello<s> how", add_special_tokens=False)
-       self.assertEqual(tokens, ["▁", "<s>", "▁Hello", "<s>", "▁how"])
+       self.assertEqual(tokens, ["▁", "<s>", "▁Hello", "<s>", "▁how"])
        decoded_tokens = tokenizer.decode(input_ids)
-       self.assertEqual(decoded_tokens, " <s> Hello<s> how")
+       self.assertEqual(decoded_tokens, "<s> Hello<s> how")

        # Let's make sure the space is preserved
        input_ids = tokenizer.encode("hello", add_special_tokens=True)
@@ -594,6 +601,63 @@ def test_special_token_special_word(self):
        decoded_tokens = tokenizer.decode(input_ids)
        self.assertEqual(decoded_tokens, "hello")

+   def test_no_prefix_space(self):
+       tokenizer = LlamaTokenizerFast.from_pretrained(
+           "huggyllama/llama-7b", legacy=False, from_slow=True, add_prefix_space=False
+       )
+       tokenizer.add_tokens([AddedToken("<REPR_END>", rstrip=True, lstrip=True)], special_tokens=False)
+
+       example_inputs = tokenizer.tokenize("<REPR_END>inform<s>. Hey. .")
+       self.assertEqual(example_inputs, ["<REPR_END>", "in", "form", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
+
+       # Make sure dummy space is added if it is indeed the first word
+       example_inputs = tokenizer.tokenize("inform<s>. Hey. .")
+       self.assertEqual(example_inputs, ["in", "form", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
+       out1 = tokenizer.decode(
+           tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
+       )
+       self.assertEqual(out1, "<REPR_END>inform")
+       out2 = tokenizer.decode(
+           tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=True
+       )
+       # decoding strips the added prefix space.
+       self.assertEqual(out2, "<REPR_END>inform")
+       input_ids = tokenizer.encode("<REPR_END>inform", add_special_tokens=False)
+       self.assertEqual(input_ids, [32000, 262, 689])  # 29871 is the spiece underline, '▁' added as it should
+
+       out2 = tokenizer.decode(
+           tokenizer.encode(" <REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
+       )
+       self.assertEqual(out2, "<REPR_END>inform")
+
+       input_ids = tokenizer.encode("<s> Hello<s>how", add_special_tokens=False)
+       self.assertEqual(input_ids, [1, 15043, 1, 3525])
+       tokens = tokenizer.tokenize("<s> Hello<s>how", add_special_tokens=False)
+       self.assertEqual(tokens, ["<s>", "▁Hello", "<s>", "how"])
+       decoded_tokens = tokenizer.decode(input_ids)
+       self.assertEqual(decoded_tokens, "<s> Hello<s>how")
+
+       # Let's make sure that if there are any spaces, we don't remove them!
+       input_ids = tokenizer.encode(" <s> Hello<s> how", add_special_tokens=False)
+       self.assertEqual(input_ids, [29871, 1, 15043, 1, 920])
+       tokens = tokenizer.tokenize(" <s> Hello<s> how", add_special_tokens=False)
+       self.assertEqual(tokens, ["▁", "<s>", "▁Hello", "<s>", "▁how"])
+       decoded_tokens = tokenizer.decode(input_ids)
+       self.assertEqual(decoded_tokens, " <s> Hello<s> how")
+
+       # Let's make sure the space is preserved
+       input_ids = tokenizer.encode("hello", add_special_tokens=True)
+       self.assertEqual(input_ids, [1, 12199])
+       tokens = tokenizer.tokenize("hello")
+       self.assertEqual(tokens, ["hello"])
+       decoded_tokens = tokenizer.decode(input_ids)
+       self.assertEqual(decoded_tokens, "<s>hello")
+
+       input_ids = tokenizer.encode("hello", add_special_tokens=False)
+       self.assertEqual(input_ids, [12199])
+       decoded_tokens = tokenizer.decode(input_ids)
+       self.assertEqual(decoded_tokens, "hello")
+
    def test_some_edge_cases(self):
        tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)

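Condensed from the tests above into a standalone check one could run locally, assuming network access to the `huggyllama/llama-7b` files; the expected outputs are copied from the assertions in this diff.

```python
from transformers import LlamaTokenizerFast

tok = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)

# A genuine first word still receives the dummy prefix space.
print(tok.tokenize("inform<s>. Hey. ."))
# ['▁inform', '<s>', '.', '▁Hey', '.', '▁▁▁▁▁▁', '▁.']

# Explicit spaces are preserved rather than stripped.
print(tok.tokenize(" <s> Hello<s> how"))
# ['▁', '<s>', '▁Hello', '<s>', '▁how']
```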