Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[LlamaTokenizerFast] Refactor default llama #28881

Merged
merged 28 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
689c9aa
push legacy to fast as well
ArthurZucker Feb 6, 2024
7a3ddf4
super strange
ArthurZucker Feb 6, 2024
d754f3b
Update src/transformers/convert_slow_tokenizer.py
ArthurZucker Feb 20, 2024
e1ef2d3
make sure we are BC
ArthurZucker Mar 22, 2024
0897c61
fix Llama test
ArthurZucker Mar 22, 2024
b188318
nit
ArthurZucker Mar 22, 2024
f7e8b06
revert
ArthurZucker Mar 22, 2024
d939358
more test
ArthurZucker Mar 22, 2024
5c48f71
Merge branch 'main' of github.com:huggingface/transformers into refac…
ArthurZucker Mar 22, 2024
ce042c6
style
ArthurZucker Mar 22, 2024
bbd26b0
update
ArthurZucker Mar 25, 2024
84d406c
small update w.r.t tokenizers
ArthurZucker Mar 30, 2024
038869d
Merge branch 'main' of github.com:huggingface/transformers into refac…
ArthurZucker Apr 22, 2024
6fbb0ac
nit
ArthurZucker Apr 22, 2024
6bd696e
don't split
ArthurZucker Apr 22, 2024
436335d
lol
ArthurZucker Apr 23, 2024
afa51b1
add a test for `add_prefix_space=False`
ArthurZucker Apr 23, 2024
312cc1d
fix gemma tokenizer as well
ArthurZucker Apr 23, 2024
feeec97
update
ArthurZucker Apr 23, 2024
e7953f5
fix gemma
ArthurZucker Apr 23, 2024
ca25a81
nicer failures
ArthurZucker Apr 23, 2024
7bfe577
fixup
ArthurZucker Apr 23, 2024
05cd744
update
ArthurZucker Apr 23, 2024
9c92741
fix the example for legacy = False
ArthurZucker Apr 23, 2024
187efad
use `huggyllama/llama-7b` for the PR doctest
ArthurZucker Apr 23, 2024
9eac9c7
nit
ArthurZucker Apr 23, 2024
4c40705
use from_slow
ArthurZucker Apr 23, 2024
e775b7d
fix llama
ArthurZucker Apr 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions src/transformers/convert_slow_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from typing import Dict, List, Tuple

from packaging import version

from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece

Expand Down Expand Up @@ -1393,14 +1394,18 @@ def tokenizer(self, proto):
return tokenizer

def normalizer(self, proto):
ArthurZucker marked this conversation as resolved.
Show resolved Hide resolved
sequence = []
if hasattr(self.original_tokenizer, "add_prefix_space"):
if self.original_tokenizer.add_prefix_space:
if self.original_tokenizer.legacy:
sequence = []
if getattr(self.original_tokenizer, "add_prefix_space"):
sequence += [normalizers.Prepend(prepend="▁")]
sequence += [normalizers.Replace(pattern=" ", content="▁")]
return normalizers.Sequence(sequence)
sequence += [normalizers.Replace(pattern=" ", content="▁")]
return normalizers.Sequence(sequence)
return None # non-legacy, no normalizer

def pre_tokenizer(self, replacement, add_prefix_space):
if not self.original_tokenizer.legacy: # non-legacy, we need a replace
prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme, split=False)
return None

def post_processor(self):
Expand Down
35 changes: 35 additions & 0 deletions src/transformers/models/llama/tokenization_llama_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,29 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
add_eos_token (`bool`, *optional*, defaults to `False`):
Whether or not to add an `eos_token` at the end of sequences.
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used
legacy (`bool`, *optional*):
Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
example:

- `legacy=True`:
```python
>>> from transformers import T5Tokenizer

>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
ArthurZucker marked this conversation as resolved.
Show resolved Hide resolved
>>> tokenizer.encode("Hello <extra_id_0>.")
[8774, 32099, 3, 5, 1]
```
- `legacy=False`:
```python
>>> from transformers import T5Tokenizer

>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
[8774, 32099, 5, 1]
```
Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
Whether or not the default system prompt for Llama should be used.
add_prefix_space (`bool`, *optional*):
Whether or not the tokenizer should automatically add a prefix space
Expand All @@ -112,9 +135,21 @@ def __init__(
add_bos_token=True,
add_eos_token=False,
use_default_system_prompt=False,
legacy=None,
add_prefix_space=None,
**kwargs,
):
if legacy is None:
logger.warning_once(
f"You are using the default legacy behaviour of the {self.__class__}. This is"
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
" means, and thoroughly read the reason why this was added as explained in"
" https://github.com/huggingface/transformers/pull/24565"
)
legacy = True
self.legacy = legacy

if add_prefix_space is not None:
logger.warning_once(
"You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
Expand Down
76 changes: 69 additions & 7 deletions tests/models/llama/test_tokenization_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,8 +543,15 @@ def test_integration_test_xnli(self):

def test_special_token_special_word(self):
# the word inform should be split as ['in', 'form']
tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
tokenizer.add_tokens([AddedToken("<REPR_END>", rstrip=True, lstrip=True)], special_tokens=False)

example_inputs = tokenizer.tokenize("<REPR_END>inform<s>. Hey. .")
self.assertEqual(example_inputs, ["<REPR_END>", "in", "form", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])

# Make sure dummy space is added if it is indeed the first word
example_inputs = tokenizer.tokenize("inform<s>. Hey. .")
self.assertEqual(example_inputs, ["▁inform", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
out1 = tokenizer.decode(
tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
)
Expand All @@ -553,12 +560,12 @@ def test_special_token_special_word(self):
tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=True
)
# decoding strips the added prefix space.
self.assertEqual(out2, "<REPR_END> inform")
self.assertEqual(out2, "<REPR_END>inform")
input_ids = tokenizer.encode("<REPR_END>inform", add_special_tokens=False)
self.assertEqual(input_ids, [29871, 32000, 262, 689]) # 29871 is the spiece underline, '▁' added as it should
self.assertEqual(input_ids, [32000, 262, 689]) # 29871 is the spiece underline, '▁' added as it should

out2 = tokenizer.decode(
tokenizer.encode(" <REPR_END> inform", add_special_tokens=False), spaces_between_special_tokens=False
tokenizer.encode(" <REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
)
# TODO @ArthurZ currently we strip left and right, so this will not keep the spaces
self.assertEqual(out2, "<REPR_END>inform")
Expand All @@ -575,11 +582,11 @@ def test_special_token_special_word(self):

# Let's make sure that if there are any spaces, we don't remove them!
input_ids = tokenizer.encode(" <s> Hello<s> how", add_special_tokens=False)
self.assertEqual(input_ids, [259, 1, 15043, 1, 920])
self.assertEqual(input_ids, [29871, 1, 15043, 1, 920])
tokens = tokenizer.tokenize(" <s> Hello<s> how", add_special_tokens=False)
self.assertEqual(tokens, ["▁", "<s>", "▁Hello", "<s>", "▁how"])
self.assertEqual(tokens, ["▁", "<s>", "▁Hello", "<s>", "▁how"])
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, " <s> Hello<s> how")
self.assertEqual(decoded_tokens, "<s> Hello<s> how")

# Let's make sure the space is preserved
input_ids = tokenizer.encode("hello", add_special_tokens=True)
Expand All @@ -594,6 +601,61 @@ def test_special_token_special_word(self):
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, "hello")

def test_no_prefix_space(self):
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True, add_prefix_space=False)
tokenizer.add_tokens([AddedToken("<REPR_END>", rstrip=True, lstrip=True)], special_tokens=False)

example_inputs = tokenizer.tokenize("<REPR_END>inform<s>. Hey. .")
self.assertEqual(example_inputs, ["<REPR_END>", "in", "form", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])

# Make sure dummy space is added if it is indeed the first word
example_inputs = tokenizer.tokenize("inform<s>. Hey. .")
self.assertEqual(example_inputs, ["in", "form", "<s>", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
out1 = tokenizer.decode(
tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
)
self.assertEqual(out1, "<REPR_END>inform")
out2 = tokenizer.decode(
tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=True
)
# decoding strips the added prefix space.
self.assertEqual(out2, "<REPR_END>inform")
input_ids = tokenizer.encode("<REPR_END>inform", add_special_tokens=False)
self.assertEqual(input_ids, [32000, 262, 689]) # 29871 is the spiece underline, '▁' added as it should

out2 = tokenizer.decode(
tokenizer.encode(" <REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
)
self.assertEqual(out2, "<REPR_END>inform")

input_ids = tokenizer.encode("<s> Hello<s>how", add_special_tokens=False)
self.assertEqual(input_ids, [1, 15043, 1, 3525])
tokens = tokenizer.tokenize("<s> Hello<s>how", add_special_tokens=False)
self.assertEqual(tokens, ["<s>", "▁Hello", "<s>", "how"])
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, "<s> Hello<s>how")

# Let's make sure that if there are any spaces, we don't remove them!
input_ids = tokenizer.encode(" <s> Hello<s> how", add_special_tokens=False)
self.assertEqual(input_ids, [29871, 1, 15043, 1, 920])
tokens = tokenizer.tokenize(" <s> Hello<s> how", add_special_tokens=False)
self.assertEqual(tokens, ["▁", "<s>", "▁Hello", "<s>", "▁how"])
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, " <s> Hello<s> how")

# Let's make sure the space is preserved
input_ids = tokenizer.encode("hello", add_special_tokens=True)
self.assertEqual(input_ids, [1, 12199])
tokens = tokenizer.tokenize("hello")
self.assertEqual(tokens, ["hello"])
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, "<s>hello")

input_ids = tokenizer.encode("hello", add_special_tokens=False)
self.assertEqual(input_ids, [12199])
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, "hello")

def test_some_edge_cases(self):
tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)

Expand Down
Loading