GGUF: Fix llama 3 GGUF #31358

Merged 18 commits on Jun 20, 2024
Changes from 4 commits
74 changes: 56 additions & 18 deletions src/transformers/integrations/ggml.py
@@ -540,15 +540,26 @@ def __init__(self, dict_):
            self.merges = merges
        else:
            self.merges = [tuple(merge.split(" ")) for merge in self.merges]
        if not hasattr(self, "scores"):
            self.scores = [None for _ in range(len(self.tokens))]

        if not hasattr(self, "added_tokens"):
            self.added_tokens = []

        if not hasattr(self, "unk_token_id"):
            self.unk_token_id = None

        # Llama2 uses the field `unknown_token_id`
        if hasattr(self, "unknown_token_id") and self.unk_token_id is None:
            self.unk_token_id = self.unknown_token_id


class GGUFLlamaConverter(LlamaConverter):
    def __init__(self, tokenizer_dict):
        self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
        self.original_tokenizer = self.proto
        self.additional_kwargs = {}
        self.uses_byte_level_decoding = getattr(self.proto, "tokenizer_type", "llama") != "llama"

    def vocab(self, proto):
        return list(zip(proto.tokens, proto.scores))
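For readers skimming the hunk above: GGUF files exported by different tools do not always carry the same tokenizer metadata keys, so the skeleton fills in safe defaults. A minimal standalone sketch of that idea follows; the function name and its dict argument are hypothetical, not part of this PR.

```python
# Sketch only, assuming `tokenizer_dict` is the raw GGUF tokenizer metadata dict.
def normalize_tokenizer_dict(tokenizer_dict: dict) -> dict:
    d = dict(tokenizer_dict)
    # Some GGUF exports ship no scores; default to one None per token.
    d.setdefault("scores", [None] * len(d.get("tokens", [])))
    # Added tokens and the unknown-token id may be missing entirely.
    d.setdefault("added_tokens", [])
    d.setdefault("unk_token_id", None)
    # Llama 2 GGUF files use `unknown_token_id` instead of `unk_token_id`.
    if d["unk_token_id"] is None and "unknown_token_id" in d:
        d["unk_token_id"] = d["unknown_token_id"]
    return d
```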
@@ -560,30 +571,55 @@ def tokenizer(self, proto):
        vocab_scores = self.vocab(self.proto)
        merges = self.merges(self.proto)
        bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
        tokenizer = Tokenizer(
            BPE(bpe_vocab, merges, unk_token=proto.tokens[proto.unk_token_id], fuse_unk=True, byte_fallback=True)
        )
        tokenizer.add_special_tokens(
            [
                AddedToken("<unk>", normalized=False, special=True),
                AddedToken("<s>", normalized=False, special=True),
                AddedToken("</s>", normalized=False, special=True),
            ]
        )

        unk_token = proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
        bos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "bos_token_id", None) is not None else None
        eos_token = proto.tokens[proto.eos_token_id] if getattr(proto, "eos_token_id", None) is not None else None

        tokenizer = Tokenizer(BPE(bpe_vocab, merges, unk_token=unk_token, fuse_unk=True, byte_fallback=True))

        special_tokens = []

        if not hasattr(self.proto, "token_type"):
            if unk_token is not None:
                special_tokens.append(AddedToken(unk_token, normalized=False, special=True))

            if bos_token is not None:
                special_tokens.append(AddedToken(bos_token, normalized=False, special=True))

            if eos_token is not None:
                special_tokens.append(AddedToken(eos_token, normalized=False, special=True))
        else:
            # 3 stands for special tokens
            special_tokens_idx = np.where(np.array(self.proto.token_type) == 3)[0]

            for idx in special_tokens_idx:
                special_tokens.append(AddedToken(self.proto.tokens[idx], normalized=False, special=True))

        if len(special_tokens) != 0:
            tokenizer.add_special_tokens(special_tokens)

        if len(self.proto.added_tokens) != 0:
            tokenizer.add_special_tokens(
                [AddedToken(added_token, normalized=False, special=False) for added_token in self.added_tokens]
                [AddedToken(added_token, normalized=False, special=False) for added_token in self.proto.added_tokens]
            )

self.additional_kwargs["unk_token"] = unk_token
self.additional_kwargs["eos_token"] = bos_token
self.additional_kwargs["bos_token"] = eos_token

return tokenizer
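As a side note on the `token_type` branch above, here is a hedged, standalone sketch of how special tokens can be recovered from GGUF metadata; only the `token_type == 3` convention is taken from the diff, the helper function itself is hypothetical.

```python
# Sketch only, not part of the PR: pull out control/special tokens from GGUF
# metadata when a `token_type` array is present (value 3 marks special tokens).
import numpy as np
from tokenizers import AddedToken

def collect_special_tokens(tokens, token_type):
    special_idx = np.where(np.array(token_type) == 3)[0]
    return [AddedToken(tokens[i], normalized=False, special=True) for i in special_idx]
```

For a Llama 3 GGUF file this would typically pick up control tokens such as `<|begin_of_text|>` and `<|end_of_text|>`.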

    def decoder(self, replacement, add_prefix_space):
        sequence = [
            decoders.ByteFallback(),
            decoders.Fuse(),
            decoders.Replace("▁", " "),
        ]
        if not self.uses_byte_level_decoding:
Collaborator: The pre_tokenizer should also be ByteLevel.

            sequence = [
Collaborator: You can have ByteFallback along with ByteLevel decoding!

Contributor (author): This is indeed misleading; I changed the arg name a bit.

Collaborator: This is still wrong! The sequence should always be ByteFallback, Fuse and Replace, but if it is a Llama 3 tokenizer, you add the ByteLevel as well.

                decoders.ByteFallback(),
                decoders.Fuse(),
                decoders.Replace("▁", " "),
            ]
        else:
            sequence = [decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)]

        if add_prefix_space:
            sequence += [decoders.Strip(content=" ", left=1)]
        return decoders.Sequence(sequence)
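To make the review thread above easier to follow, a hedged sketch of the two decoder pipelines being compared; the decoder classes and their arguments are the ones used in the diff, while the standalone helper and its `is_byte_level_bpe` flag are hypothetical names, not the PR's API.

```python
# Sketch only, assuming the `tokenizers` decoders used in the diff above.
from tokenizers import decoders

def build_decoder(is_byte_level_bpe: bool, add_prefix_space: bool = True):
    if is_byte_level_bpe:
        # Llama 3 GGUF vocabularies are byte-level BPE, so decoding goes through ByteLevel.
        sequence = [decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)]
    else:
        # SentencePiece-style vocabularies (e.g. Llama 2, Mistral) decode via
        # byte fallback, token fusing and "▁" -> " " replacement instead.
        sequence = [
            decoders.ByteFallback(),
            decoders.Fuse(),
            decoders.Replace("▁", " "),
        ]
    if add_prefix_space:
        sequence.append(decoders.Strip(content=" ", left=1))
    return decoders.Sequence(sequence)
```

Per the reviewer's note, ByteFallback can in principle be combined with ByteLevel, so treating the two branches as mutually exclusive here mirrors the diff as shown rather than a final design.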
@@ -592,6 +628,7 @@ def decoder(self, replacement, add_prefix_space):
class GGUFQwen2Converter(Qwen2Converter):
    def __init__(self, tokenizer_dict):
        self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
        self.additional_kwargs = {}

    def converted(self) -> Tokenizer:
        vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
@@ -629,5 +666,6 @@ def convert_gguf_tokenizer(architecture, tokenizer_dict) -> Tokenizer:
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    """
    tokenizer_class_name = architecture
    converter_class = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name]
    return converter_class(tokenizer_dict).converted()
    converter = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name](tokenizer_dict)
    fast_tokenizer = converter.converted()
    return fast_tokenizer, converter.additional_kwargs
7 changes: 6 additions & 1 deletion src/transformers/tokenization_utils_fast.py
@@ -121,7 +121,11 @@ def __init__(self, *args, **kwargs):
            gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
            architecture = gguf_param["config"]["model_type"]
            tokenizer_dict = gguf_param["tokenizer"]
            fast_tokenizer = convert_gguf_tokenizer(architecture, tokenizer_dict)
            fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict)
Collaborator: Yeah, sounds good to me TBH.


            if len(additional_kwargs) > 0:
                kwargs.update(additional_kwargs)

        elif self.slow_tokenizer_class is not None:
            # We need to create and convert a slow tokenizer to build the backend
            slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
@@ -182,6 +186,7 @@ def __init__(self, *args, **kwargs):
        tokens_to_add += [
            token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
        ]

        if len(tokens_to_add) > 0:
            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
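A quick, hedged usage sketch of what the two changes above buy end users: the unk/bos/eos tokens recovered from the GGUF metadata now flow into the fast tokenizer's constructor kwargs. The repo and file names below are the ones used in the test file that follows; printing `None` for a missing unk token is expected.

```python
# Sketch only: load a Llama 3 GGUF tokenizer and check that the special tokens
# recovered from the GGUF metadata were forwarded as constructor kwargs.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "NousResearch/Meta-Llama-3-8B-GGUF",
    gguf_file="Meta-Llama-3-8B-Q4_K_M.gguf",
)
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token)
```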
15 changes: 15 additions & 0 deletions tests/quantization/ggml/test_ggml.py
@@ -32,6 +32,7 @@ class GgufIntegrationTests(unittest.TestCase):
    model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
    mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
    qwen2_model_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
    llama3_model_id = "NousResearch/Meta-Llama-3-8B-GGUF"

    q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
    q4_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
@@ -43,6 +44,7 @@ class GgufIntegrationTests(unittest.TestCase):

    q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf"
    q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf"
    q4_llama3_model_id = "Meta-Llama-3-8B-Q4_K_M.gguf"

    example_text = "Hello"

@@ -171,6 +173,19 @@ def test_qwen2_q4_0(self):
EXPECTED_TEXT = "Hello.jsoup\n\nI am a beginner"
self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)

    def test_llama3_q4_0(self):
Collaborator: The string that you are testing could be improved, specifically to take the special tokens etc. into account. But alright otherwise!

        tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
        model = AutoModelForCausalLM.from_pretrained(
            self.llama3_model_id, gguf_file=self.q4_llama3_model_id, device_map="auto", torch_dtype=torch.float16
        )

        text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
        out = model.generate(**text, max_new_tokens=10)

        EXPECTED_TEXT = "Hello, I am new to this forum. I am"

        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)

    def test_tokenization_xnli(self):
        import tqdm
        from datasets import load_dataset