From d146334c1165f21ffac9a7c2f5a944f12cd085ea Mon Sep 17 00:00:00 2001
From: Billel Mokeddem
Date: Tue, 17 Dec 2024 09:46:19 +0000
Subject: [PATCH 1/7] Add Falcon3 model support

---
 convert_hf_to_gguf.py        |  6 ++++++
 convert_hf_to_gguf_update.py |  1 +
 src/llama.cpp                | 18 ++++++++++++++++++
 3 files changed, 25 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 9dc1673bc2c06..66e268af61419 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -525,6 +525,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
+                    # We need to manually encode and decode the added tokens in case special characters
+                    # used for `\n` / `\t` have been manually added in the added tokens
+                    token = tokenizer.decode(tokenizer.encode(token))
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
@@ -571,6 +574,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 88058442f6dc4..2ba346640b352 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -72,6 +72,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
     {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "falcon3",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
     {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
     {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
diff --git a/src/llama.cpp b/src/llama.cpp
index 8b799e0ebeda7..1cc8a93323b4a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1612,6 +1612,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
     LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_FALCON_3,
     LLM_CHAT_TEMPLATE_ZEPHYR,
     LLM_CHAT_TEMPLATE_MONARCH,
     LLM_CHAT_TEMPLATE_GEMMA,
@@ -1644,6 +1645,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7 },
     { "phi3",              LLM_CHAT_TEMPLATE_PHI_3 },
+    { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3 },
     { "zephyr",            LLM_CHAT_TEMPLATE_ZEPHYR },
     { "monarch",           LLM_CHAT_TEMPLATE_MONARCH },
     { "gemma",             LLM_CHAT_TEMPLATE_GEMMA },
@@ -6473,6 +6475,11 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "falcon") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+        } else if (
+                tokenizer_pre == "falcon3") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            vocab.tokenizer_ignore_merges = true;
+            vocab.tokenizer_add_bos = true;
         } else if (
                 tokenizer_pre == "mpt") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
@@ -22219,6 +22226,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+        return LLM_CHAT_TEMPLATE_FALCON_3;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -22371,6 +22380,15 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
+        // Falcon 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
         // zephyr template
         for (auto message : chat) {
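
For reference, the Falcon 3 branch added to llama_chat_apply_template_internal() above wraps every turn as "<|role|>\n<content>\n" and, when add_ass is set, leaves an open "<|assistant|>\n" turn for generation to continue from. A minimal Python re-expression of that C++ loop, illustrative only and not part of the patch series:

def falcon3_chat_template(chat: list[dict[str, str]], add_ass: bool = True) -> str:
    # Mirrors the C++ loop above: each turn becomes <|role|>\n<content>\n.
    out = ""
    for message in chat:
        out += f"<|{message['role']}|>\n{message['content']}\n"
    if add_ass:
        # Open an assistant turn so the model generates from here.
        out += "<|assistant|>\n"
    return out

print(falcon3_chat_template([{"role": "user", "content": "Hello!"}]))
# <|user|>
# Hello!
# <|assistant|>
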
tokenizer_pre == "mpt") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT; @@ -22219,6 +22226,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) { } } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { return LLM_CHAT_TEMPLATE_PHI_3; + } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { + return LLM_CHAT_TEMPLATE_FALCON_3; } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { return LLM_CHAT_TEMPLATE_ZEPHYR; } else if (tmpl_contains("bos_token + message['role']")) { @@ -22371,6 +22380,15 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>\n"; } + } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) { + // Falcon 3 + for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>\n" << message->content << "\n"; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) { // zephyr template for (auto message : chat) { From fc055407b7c557f8e935aa2191485b0de967e2a1 Mon Sep 17 00:00:00 2001 From: Billel Mokeddem Date: Wed, 18 Dec 2024 04:58:00 +0000 Subject: [PATCH 2/7] Add fix for adding bos to added special tokens --- convert_hf_to_gguf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 66e268af61419..77ab5ef4ae6b6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -527,7 +527,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: if token in added_vocab: # We need to manually encode and decode the added tokens in case special characters # used for `\n` / `\t` have been manually added in the added tokens - token = tokenizer.decode(tokenizer.encode(token)) + if len(token) == 1: + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: From b3d022aa1a352b39797ae7367448759fef631084 Mon Sep 17 00:00:00 2001 From: Billel Mokeddem Date: Wed, 18 Dec 2024 05:46:07 +0000 Subject: [PATCH 3/7] Add comment explaining the logic behind the if statement --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 77ab5ef4ae6b6..1549022523f9e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -527,6 +527,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: if token in added_vocab: # We need to manually encode and decode the added tokens in case special characters # used for `\n` / `\t` have been manually added in the added tokens + # To avoid unexpected issues - we make sure to encode single-char tokens if len(token) == 1: token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) From d8d2f370dc97f3bba3ecc1f9fc6b6853a0794028 Mon Sep 17 00:00:00 2001 From: Billel Mokeddem Date: Wed, 18 Dec 2024 07:23:35 +0000 Subject: [PATCH 4/7] Add a log message to better track the when the following line of code is triggered --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1549022523f9e..cd5dd9435bece 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -529,6 +529,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: # used for `\n` / `\t` have been manually added in the added tokens # To avoid unexpected issues - we make sure to encode single-char tokens if len(token) == 1: + 
logger.info("Ecode-Decode special characters using AutoTokenizer") token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): From 92e41ec4b993c75cc6cb4fa92f7d233084741bb8 Mon Sep 17 00:00:00 2001 From: Billel Mokeddem Date: Wed, 18 Dec 2024 08:20:28 +0000 Subject: [PATCH 5/7] Update log to only print when input and output characters are different --- convert_hf_to_gguf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cd5dd9435bece..06e3016cc9e5c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -529,8 +529,10 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: # used for `\n` / `\t` have been manually added in the added tokens # To avoid unexpected issues - we make sure to encode single-char tokens if len(token) == 1: - logger.info("Ecode-Decode special characters using AutoTokenizer") + previous_token = token token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) From a1f146dba1126c6557d9c7c8696753aba87ec5e4 Mon Sep 17 00:00:00 2001 From: Billel Mokeddem Date: Sun, 22 Dec 2024 20:12:46 +0000 Subject: [PATCH 6/7] Fix handling pre-normalized tokens --- convert_hf_to_gguf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 06e3016cc9e5c..a55bedc72cdfe 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -525,10 +525,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: else: token: str = reverse_vocab[i] if token in added_vocab: - # We need to manually encode and decode the added tokens in case special characters - # used for `\n` / `\t` have been manually added in the added tokens - # To avoid unexpected issues - we make sure to encode single-char tokens - if len(token) == 1: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not tokenizer.added_tokens_decoder[i].normalized: previous_token = token token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) if previous_token != token: @@ -537,6 +536,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: + # NOTE: this was added for Gemma. + # Encoding and decoding the tokens above isn't sufficient for this case. 
From 64d8687e22997f085ec811223b09ead34d8037bb Mon Sep 17 00:00:00 2001
From: Billel Mokeddem
Date: Sun, 22 Dec 2024 20:39:40 +0000
Subject: [PATCH 7/7] Refactoring

---
 src/llama.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 1cc8a93323b4a..00011e84255cf 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6460,7 +6460,8 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "llama3"   ||
                 tokenizer_pre == "llama-v3" ||
-                tokenizer_pre == "llama-bpe") {
+                tokenizer_pre == "llama-bpe"||
+                tokenizer_pre == "falcon3") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             vocab.tokenizer_ignore_merges = true;
             vocab.tokenizer_add_bos = true;
@@ -6475,11 +6476,6 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "falcon") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
-        } else if (
-                tokenizer_pre == "falcon3") {
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
-            vocab.tokenizer_ignore_merges = true;
-            vocab.tokenizer_add_bos = true;
         } else if (
                 tokenizer_pre == "mpt") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
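
As background on the 9d032fcb... constant added in patch 1: convert_hf_to_gguf_update.py derives each chkhsh by tokenizing a fixed probe string and hashing the resulting token IDs, roughly as in the sketch below. This is a simplified rendering; the real script uses a much longer probe text covering whitespace runs, digits, and emoji, so the hash below will not match the committed constant.

from hashlib import sha256
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Base")
chktxt = "\n \n\n \t Hello world 3.14"  # stand-in for the real probe text
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)  # compared against the constants in get_vocab_base_pre()
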