From 5ebc5ef572a0a6f30930cc52f65145e877071b95 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 4 Sep 2024 12:16:21 -0600 Subject: [PATCH 1/9] feat(gguf-py): Add Granite model and params to gguf-py Branch: GraniteLM Signed-off-by: Gabe Goodhart --- gguf-py/gguf/constants.py | 18 ++++++++++++++++++ gguf-py/gguf/gguf_writer.py | 9 +++++++++ 2 files changed, 27 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0d88649d84834..88619094ae408 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -97,6 +97,8 @@ class LLM: RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" + RESIDUAL_MULTIPLIER = "{arch}.residual_multiplier" + EMBEDDING_MULTIPLIER = "{arch}.embedding_multiplier" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -112,6 +114,7 @@ class Attention: KV_LORA_RANK = "{arch}.attention.kv_lora_rank" REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" SLIDING_WINDOW = "{arch}.attention.sliding_window" + MULTIPLIER = "{arch}.attention.multiplier" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" @@ -231,6 +234,7 @@ class MODEL_ARCH(IntEnum): JAIS = auto() NEMOTRON = auto() EXAONE = auto() + GRANITE = auto() class MODEL_TENSOR(IntEnum): @@ -387,6 +391,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.JAIS: "jais", MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.EXAONE: "exaone", + MODEL_ARCH.GRANITE: "granite", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -1224,6 +1229,19 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.GRANITE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 3c95c26730f7a..aed56ac9679a5 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -679,6 +679,12 @@ def add_time_mix_extra_dim(self, dim: int) -> None: def add_time_decay_extra_dim(self, dim: int) -> None: self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim) + def add_residual_multiplier(self, value: float) -> None: + self.add_float32(Keys.LLM.RESIDUAL_MULTIPLIER.format(arch=self.arch), value) + + def add_embedding_multiplier(self, value: float) -> None: + self.add_float32(Keys.LLM.EMBEDDING_MULTIPLIER.format(arch=self.arch), value) + def add_wkv_head_size(self, size: int) -> None: self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size) @@ -703,6 +709,9 @@ def add_relative_attn_buckets_count(self, value: int) -> None: def add_sliding_window(self, value: int) -> None: self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value) + def add_attention_multiplier(self, value: float) -> None: + self.add_float32(Keys.Attention.MULTIPLIER.format(arch=self.arch), value) + def add_pooling_type(self, value: PoolingType) -> None: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) From 406833d77921de64d704807582c3f422221eed67 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 4 Sep 2024 12:16:56 -0600 Subject: [PATCH 2/9] feat(convert_hf_to_gguf): Add registration and param setup for Granite Branch: GraniteLM Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 28 
++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c7e6ae0caae0e..56e86011bb216 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4080,6 +4080,34 @@ def prepare_tensors(self): super().prepare_tensors() +@Model.register("GraniteForCausalLM") +class GraniteModel(Model): + """Conversion for IBM's GraniteForCausalLM""" + model_arch = gguf.MODEL_ARCH.GRANITE + + def set_gguf_parameters(self): + """Granite uses standard llama parameters with the following differences: + + - No head_dim support + - New multiplier params: + - attention_multiplier + - embedding_multiplier + - residual_multiplier + - logits_scaling + """ + if head_dim := self.hparams.pop("head_dim", None): + logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) + super().set_gguf_parameters() + if attention_multiplier := self.hparams.get("attention_multiplier"): + self.gguf_writer.add_attention_multiplier(attention_multiplier) + if embedding_multiplier := self.hparams.get("embedding_multiplier"): + self.gguf_writer.add_embedding_multiplier(embedding_multiplier) + if residual_multiplier := self.hparams.get("residual_multiplier"): + self.gguf_writer.add_residual_multiplier(residual_multiplier) + if logits_scaling := self.hparams.get("logits_scaling"): + self.gguf_writer.add_logit_scale(logits_scaling) + + ###### CONVERSION LOGIC ###### # tree of lazy tensors From 383065ade6d4504643bd62c405477c83b9b6b153 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Thu, 5 Sep 2024 12:06:34 -0600 Subject: [PATCH 3/9] feat(llama.cpp): Add config parsing for Granite multiplier params Branch: GraniteLM Signed-off-by: Gabe Goodhart --- src/llama.cpp | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 0da764f9d1186..9d3720d2d95d5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -214,6 +214,7 @@ enum llm_arch { LLM_ARCH_NEMOTRON, LLM_ARCH_EXAONE, LLM_ARCH_RWKV6, + LLM_ARCH_GRANITE, LLM_ARCH_UNKNOWN, }; @@ -264,6 +265,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_RWKV6, "rwkv6" }, + { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -303,6 +305,8 @@ enum llm_kv { LLM_KV_RESCALE_EVERY_N_LAYERS, LLM_KV_TIME_MIX_EXTRA_DIM, LLM_KV_TIME_DECAY_EXTRA_DIM, + LLM_KV_RESIDUAL_MULTIPLIER, + LLM_KV_EMBEDDING_MULTIPLIER, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -317,6 +321,7 @@ enum llm_kv { LLM_KV_ATTENTION_KV_LORA_RANK, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, + LLM_KV_ATTENTION_MULTIPLIER, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, @@ -407,6 +412,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" }, { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" }, { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" }, + { LLM_KV_RESIDUAL_MULTIPLIER, "%s.residual_multiplier" }, + { LLM_KV_EMBEDDING_MULTIPLIER, "%s.embedding_multiplier" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -421,6 +428,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, + { 
LLM_KV_ATTENTION_MULTIPLIER, "%s.attention.multiplier" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, @@ -2372,6 +2380,11 @@ struct llama_hparams { float f_max_alibi_bias = 0.0f; float f_logit_scale = 0.0f; + // For Granite architecture + float f_residual_multiplier = 0.0f; + float f_embedding_multiplier = 0.0f; + float f_attention_multiplier = 0.0f; + bool causal_attn = true; bool use_alibi = false; bool attn_soft_cap = false; @@ -2427,13 +2440,16 @@ struct llama_hparams { const float EPSILON = 1e-9f; - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; - if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; - if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; - if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; + if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; + if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; + if (!is_float_close(this->f_residual_multiplier, other.f_residual_multiplier, EPSILON)) return true; + if (!is_float_close(this->f_embedding_multiplier, other.f_embedding_multiplier, EPSILON)) return true; + if (!is_float_close(this->f_attention_multiplier, other.f_attention_multiplier, EPSILON)) return true; return false; } @@ -5406,6 +5422,7 @@ static void llm_load_hparams( // arch-specific KVs switch (model.arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_GRANITE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -5422,13 +5439,20 @@ static void llm_load_hparams( // granite uses a vocab with len 49152 case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break; case 36: model.type = e_model::MODEL_8B; break; // granite - case 40: model.type = e_model::MODEL_13B; break; + case 40: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : e_model::MODEL_13B; break; case 48: model.type = e_model::MODEL_34B; break; case 60: model.type = e_model::MODEL_30B; break; case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? 
e_model::MODEL_65B : e_model::MODEL_70B; break; default: model.type = e_model::MODEL_UNKNOWN; } } + // Extra multipliers for Granite architecture + if (model.arch == LLM_ARCH_GRANITE) { + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER, hparams.f_residual_multiplier); + ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier); + ml.get_key(LLM_KV_ATTENTION_MULTIPLIER, hparams.f_attention_multiplier); + } } break; case LLM_ARCH_MINICPM: { @@ -6717,6 +6741,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } + + if (model.arch == LLM_ARCH_GRANITE) { + LLAMA_LOG_INFO("%s: f_embedding_multiplier = %f\n", __func__, hparams.f_embedding_multiplier); + LLAMA_LOG_INFO("%s: f_residual_multiplier = %f\n", __func__, hparams.f_residual_multiplier); + LLAMA_LOG_INFO("%s: f_attention_multiplier = %f\n", __func__, hparams.f_attention_multiplier); + } } // Returns false if cancelled by progress_callback From ec13f29b7355e0da6282ca027a0751cd71c2e967 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Thu, 5 Sep 2024 16:43:01 -0600 Subject: [PATCH 4/9] feat(llama.cpp): First pass at full port of granite deviations from llama Something is still not working right since the results are mostly terrible, but on occasion it's producing relevant results at this point, so _something_ is working. Branch: GraniteLM Signed-off-by: Gabe Goodhart --- src/llama.cpp | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 9d3720d2d95d5..e8eba4e397cdf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1462,6 +1462,22 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" }, }, }, + { + LLM_ARCH_GRANITE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -6915,6 +6931,7 @@ static bool llm_load_tensors( case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: case LLM_ARCH_MINICPM: + case LLM_ARCH_GRANITE: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -8898,6 +8915,11 @@ static struct ggml_tensor * llm_build_inp_embd( ggml_set_input(lctx.inp_embd); } + // For Granite architecture + if (hparams.f_embedding_multiplier != 0.0f) { + inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier); + } + cb(inpL, "inp_embd", -1); return inpL; @@ -10176,6 +10198,7 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + const float kq_scale = hparams.f_attention_multiplier == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_multiplier; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10228,7 +10251,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -10239,6 +10262,11 @@ struct llm_build_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } + // For Granite architecture + if (hparams.f_residual_multiplier) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -10275,6 +10303,11 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); } + // For Granite architecture + if (hparams.f_residual_multiplier) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier); + } + cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); @@ -10294,6 +10327,12 @@ struct llm_build_context { // lm_head cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + + // For Granite architecture + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + } + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -15819,6 +15858,7 @@ static struct ggml_cgraph * llama_build_graph( switch (model.arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_GRANITE: { result = llm.build_llama(); } break; @@ -19115,6 +19155,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_CHATGLM: + case LLM_ARCH_GRANITE: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 From e73d795eff97ca2ab8083970c59030dfa2860f73 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 9 Sep 2024 09:03:09 -0600 Subject: [PATCH 5/9] fix(llama.cpp): Determine granite language 3b instruct by vocab size Branch: GraniteLM Signed-off-by: Gabe Goodhart --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index e8eba4e397cdf..9ad4e261e14d0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5455,7 +5455,7 @@ static void llm_load_hparams( // granite uses a vocab with len 49152 case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break; case 36: model.type = e_model::MODEL_8B; break; // granite - case 40: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : e_model::MODEL_13B; break; + case 40: model.type = (hparams.n_vocab == 49152 || hparams.n_vocab == 49156) ? e_model::MODEL_3B : e_model::MODEL_13B; break; case 48: model.type = e_model::MODEL_34B; break; case 60: model.type = e_model::MODEL_30B; break; case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? 
e_model::MODEL_65B : e_model::MODEL_70B; break; From 80863806a341b44e340d047af7f04e1a41b38edd Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 10 Sep 2024 09:36:44 -0600 Subject: [PATCH 6/9] fix(convert_hf_to_gguf): Use LlamaModel as base for GraniteModel The defaults in LlamaModel are needed for Granite as well Branch: GraniteLM Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 56e86011bb216..8530557d82d5f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4081,7 +4081,7 @@ def prepare_tensors(self): @Model.register("GraniteForCausalLM") -class GraniteModel(Model): +class GraniteModel(LlamaModel): """Conversion for IBM's GraniteForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE From 0bdf04e7b595383199e6d0cbf6651f169cca8abc Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 16 Sep 2024 08:55:58 -0600 Subject: [PATCH 7/9] fix(llama.cpp): Switch Granite param names to use _scale for consistency Other scalar multipliers are called *_scale, so this provides a more consistent naming convention. Branch: GraniteLM Signed-off-by: Gabe Goodhart --- src/llama.cpp | 66 +++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 9ad4e261e14d0..9c6703aad4171 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -305,8 +305,8 @@ enum llm_kv { LLM_KV_RESCALE_EVERY_N_LAYERS, LLM_KV_TIME_MIX_EXTRA_DIM, LLM_KV_TIME_DECAY_EXTRA_DIM, - LLM_KV_RESIDUAL_MULTIPLIER, - LLM_KV_EMBEDDING_MULTIPLIER, + LLM_KV_RESIDUAL_SCALE, + LLM_KV_EMBEDDING_SCALE, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -321,7 +321,7 @@ enum llm_kv { LLM_KV_ATTENTION_KV_LORA_RANK, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, - LLM_KV_ATTENTION_MULTIPLIER, + LLM_KV_ATTENTION_SCALE, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, @@ -412,8 +412,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" }, { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" }, { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" }, - { LLM_KV_RESIDUAL_MULTIPLIER, "%s.residual_multiplier" }, - { LLM_KV_EMBEDDING_MULTIPLIER, "%s.embedding_multiplier" }, + { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" }, + { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -428,7 +428,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, - { LLM_KV_ATTENTION_MULTIPLIER, "%s.attention.multiplier" }, + { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, @@ -2396,10 +2396,10 @@ struct llama_hparams { float f_max_alibi_bias = 0.0f; float f_logit_scale = 0.0f; - // For Granite architecture - float f_residual_multiplier = 0.0f; - float f_embedding_multiplier = 0.0f; - float f_attention_multiplier = 0.0f; + // Additional scale factors (Granite) + float f_residual_scale = 0.0f; + float f_embedding_scale = 0.0f; + float f_attention_scale = 0.0f; bool causal_attn = true; bool use_alibi = false; @@ -2456,16 +2456,16 @@ struct 
llama_hparams { const float EPSILON = 1e-9f; - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; - if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; - if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; - if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; - if (!is_float_close(this->f_residual_multiplier, other.f_residual_multiplier, EPSILON)) return true; - if (!is_float_close(this->f_embedding_multiplier, other.f_embedding_multiplier, EPSILON)) return true; - if (!is_float_close(this->f_attention_multiplier, other.f_attention_multiplier, EPSILON)) return true; + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; + if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; + if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; + if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true; + if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true; + if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true; return false; } @@ -5465,9 +5465,9 @@ static void llm_load_hparams( // Extra multipliers for Granite architecture if (model.arch == LLM_ARCH_GRANITE) { ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); - ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER, hparams.f_residual_multiplier); - ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier); - ml.get_key(LLM_KV_ATTENTION_MULTIPLIER, hparams.f_attention_multiplier); + ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); + ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); } } break; case LLM_ARCH_MINICPM: @@ -6759,9 +6759,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { } if (model.arch == LLM_ARCH_GRANITE) { - LLAMA_LOG_INFO("%s: f_embedding_multiplier = %f\n", __func__, hparams.f_embedding_multiplier); - LLAMA_LOG_INFO("%s: f_residual_multiplier = %f\n", __func__, hparams.f_residual_multiplier); - LLAMA_LOG_INFO("%s: f_attention_multiplier = %f\n", __func__, hparams.f_attention_multiplier); + LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); + LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); + LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); } } @@ -8916,8 +8916,8 @@ static struct ggml_tensor * llm_build_inp_embd( } // For Granite architecture - if (hparams.f_embedding_multiplier != 0.0f) { - inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier); + if 
(hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale); } cb(inpL, "inp_embd", -1); @@ -10198,7 +10198,7 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - const float kq_scale = hparams.f_attention_multiplier == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_multiplier; + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10263,8 +10263,8 @@ struct llm_build_context { } // For Granite architecture - if (hparams.f_residual_multiplier) { - cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier); + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -10304,8 +10304,8 @@ struct llm_build_context { } // For Granite architecture - if (hparams.f_residual_multiplier) { - cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier); + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); } cur = ggml_add(ctx0, cur, ffn_inp); From 65c5bb91abd6cdc00be7ed0d7c8d2d771a82f6d6 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 16 Sep 2024 08:56:56 -0600 Subject: [PATCH 8/9] fix(convert_hf_to_gguf/gguf-py): _multiplier -> _scale The transformers names with _multiplier will now be converted to the _scale equivalent during conversion. Branch: GraniteLM Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 20 +++++++++++--------- gguf-py/gguf/constants.py | 6 +++--- gguf-py/gguf/gguf_writer.py | 12 ++++++------ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8530557d82d5f..ff4c9226faedb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4090,20 +4090,22 @@ def set_gguf_parameters(self): - No head_dim support - New multiplier params: - - attention_multiplier - - embedding_multiplier - - residual_multiplier + - attention_scale + - embedding_scale + - residual_scale - logits_scaling """ if head_dim := self.hparams.pop("head_dim", None): logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) super().set_gguf_parameters() - if attention_multiplier := self.hparams.get("attention_multiplier"): - self.gguf_writer.add_attention_multiplier(attention_multiplier) - if embedding_multiplier := self.hparams.get("embedding_multiplier"): - self.gguf_writer.add_embedding_multiplier(embedding_multiplier) - if residual_multiplier := self.hparams.get("residual_multiplier"): - self.gguf_writer.add_residual_multiplier(residual_multiplier) + # NOTE: Convert _multiplier params to _scale params for naming + # consistency + if attention_scale := self.hparams.get("attention_multiplier"): + self.gguf_writer.add_attention_scale(attention_scale) + if embedding_scale := self.hparams.get("embedding_multiplier"): + self.gguf_writer.add_embedding_scale(embedding_scale) + if residual_scale := self.hparams.get("residual_multiplier"): + self.gguf_writer.add_residual_scale(residual_scale) if logits_scaling := self.hparams.get("logits_scaling"): self.gguf_writer.add_logit_scale(logits_scaling) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 88619094ae408..b36a60d497abd 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -97,8 +97,8 @@ class LLM: 
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" - RESIDUAL_MULTIPLIER = "{arch}.residual_multiplier" - EMBEDDING_MULTIPLIER = "{arch}.embedding_multiplier" + RESIDUAL_SCALE = "{arch}.residual_scale" + EMBEDDING_SCALE = "{arch}.embedding_scale" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -114,7 +114,7 @@ class Attention: KV_LORA_RANK = "{arch}.attention.kv_lora_rank" REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" SLIDING_WINDOW = "{arch}.attention.sliding_window" - MULTIPLIER = "{arch}.attention.multiplier" + SCALE = "{arch}.attention.scale" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index aed56ac9679a5..bd059b45c64d0 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -679,11 +679,11 @@ def add_time_mix_extra_dim(self, dim: int) -> None: def add_time_decay_extra_dim(self, dim: int) -> None: self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim) - def add_residual_multiplier(self, value: float) -> None: - self.add_float32(Keys.LLM.RESIDUAL_MULTIPLIER.format(arch=self.arch), value) + def add_residual_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value) - def add_embedding_multiplier(self, value: float) -> None: - self.add_float32(Keys.LLM.EMBEDDING_MULTIPLIER.format(arch=self.arch), value) + def add_embedding_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value) def add_wkv_head_size(self, size: int) -> None: self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size) @@ -709,8 +709,8 @@ def add_relative_attn_buckets_count(self, value: int) -> None: def add_sliding_window(self, value: int) -> None: self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value) - def add_attention_multiplier(self, value: float) -> None: - self.add_float32(Keys.Attention.MULTIPLIER.format(arch=self.arch), value) + def add_attention_scale(self, value: float) -> None: + self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value) def add_pooling_type(self, value: PoolingType) -> None: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) From 5d054a42f96ec959fc9070ea83e160a8a2740225 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 16 Sep 2024 09:15:15 -0600 Subject: [PATCH 9/9] fix(llama.cpp): Use separate switch clause for granite in llm_load_hparams Branch: GraniteLM Signed-off-by: Gabe Goodhart --- src/llama.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 9c6703aad4171..79df86cf8dc4b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5438,7 +5438,6 @@ static void llm_load_hparams( // arch-specific KVs switch (model.arch) { case LLM_ARCH_LLAMA: - case LLM_ARCH_GRANITE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -5455,20 +5454,13 @@ static void llm_load_hparams( // granite uses a vocab with len 49152 case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break; case 36: model.type = e_model::MODEL_8B; break; // granite - case 40: model.type = (hparams.n_vocab == 49152 || hparams.n_vocab == 49156) ? 
e_model::MODEL_3B : e_model::MODEL_13B; break; + case 40: model.type = e_model::MODEL_13B; break; case 48: model.type = e_model::MODEL_34B; break; case 60: model.type = e_model::MODEL_30B; break; case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break; default: model.type = e_model::MODEL_UNKNOWN; } } - // Extra multipliers for Granite architecture - if (model.arch == LLM_ARCH_GRANITE) { - ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); - ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); - ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); - ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); - } } break; case LLM_ARCH_MINICPM: { @@ -6059,6 +6051,20 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GRANITE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); + ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); + + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_3B; break; + // Add additional layer/vocab/etc checks here for other model sizes + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; }
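
Reviewer note (not part of the patches): for anyone tracing where the four Granite scale factors end up, below is a minimal NumPy sketch of the effective decoder math after patches 4 and 7 — embedding_scale on the input embeddings, attention.scale as the KQ scale, residual_scale on both branch outputs before the residual add, and logits divided by logit_scale. The scale values and the attn/ffn stand-ins are placeholders for illustration only, not taken from any real checkpoint or from ggml.

    # Illustrative sketch of the Granite scaling applied in llm_build_llama(),
    # using placeholder hparams values (GGUF keys: %s.embedding_scale,
    # %s.attention.scale, %s.residual_scale, %s.logit_scale).
    import numpy as np

    f_embedding_scale = 12.0       # placeholder
    f_attention_scale = 0.0078125  # placeholder; used in place of 1/sqrt(head_dim)
    f_residual_scale  = 0.22       # placeholder
    f_logit_scale     = 16.0       # placeholder

    def granite_layer(x, attn_fn, ffn_fn):
        # attention branch uses kq_scale = f_attention_scale and is scaled by
        # f_residual_scale before being added back to the residual stream
        h = x + f_residual_scale * attn_fn(x, kq_scale=f_attention_scale)
        # FFN branch gets the same residual scaling
        return h + f_residual_scale * ffn_fn(h)

    def granite_forward(token_embeddings, layers, output_proj):
        # input embeddings are multiplied by f_embedding_scale (llm_build_inp_embd)
        x = f_embedding_scale * token_embeddings
        for attn_fn, ffn_fn in layers:
            x = granite_layer(x, attn_fn, ffn_fn)
        # lm_head output is divided by f_logit_scale
        return (x @ output_proj) / f_logit_scale

    if __name__ == "__main__":
        rng = np.random.default_rng(0)
        d = 8
        dummy_attn = lambda x, kq_scale: kq_scale * x  # stand-in for llm_build_kv
        dummy_ffn  = np.tanh                           # stand-in for the gated FFN
        emb = rng.standard_normal((4, d))
        w_out = rng.standard_normal((d, 16))
        print(granite_forward(emb, [(dummy_attn, dummy_ffn)] * 2, w_out).shape)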