From 3c1242a31916799ceacc078f6f34c94380570375 Mon Sep 17 00:00:00 2001 From: wirthual Date: Wed, 2 Oct 2024 03:42:04 +0200 Subject: [PATCH 1/3] fix missing imports --- common/CMakeLists.txt | 2 ++ common/vision.cpp | 2 +- examples/llava/clip.cpp | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 042e895add5e2..dd385d1bd8c23 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -68,6 +68,8 @@ add_library(${TARGET} STATIC sampling.h train.cpp train.h + vision.h + vision.cpp ) if (BUILD_SHARED_LIBS) diff --git a/common/vision.cpp b/common/vision.cpp index 5b003654aa334..b63d8b2ab1dee 100644 --- a/common/vision.cpp +++ b/common/vision.cpp @@ -26,7 +26,7 @@ llama_img * load_image_from_file(const char * fname) { // for (int y = 0; y < ny; y++) { // for (int x = 0; x < nx; x++) { // unsigned char * pix = img + x*nc + y*nc*nx; - // printf("%02x%02x%02x ", pix[0], pix[1], pix[2]); + // printf("%02x%02x%02xload_image_from_file ", pix[0], pix[1], pix[2]); // } // printf("\n"); // } diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ecc538256eaad..9cca234a696f6 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -37,7 +37,8 @@ #include #include #include -#include +#include +#include #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) From 3ca3898502a1aa5c08c1428d4cb9a7718309e244 Mon Sep 17 00:00:00 2001 From: wirthual Date: Wed, 2 Oct 2024 03:46:40 +0200 Subject: [PATCH 2/3] revert vision.cpp --- common/vision.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/vision.cpp b/common/vision.cpp index b63d8b2ab1dee..5b003654aa334 100644 --- a/common/vision.cpp +++ b/common/vision.cpp @@ -26,7 +26,7 @@ llama_img * load_image_from_file(const char * fname) { // for (int y = 0; y < ny; y++) { // for (int x = 0; x < nx; x++) { // unsigned char * pix = img + x*nc + y*nc*nx; - // printf("%02x%02x%02xload_image_from_file ", pix[0], pix[1], pix[2]); + // printf("%02x%02x%02x ", pix[0], pix[1], pix[2]); // } // printf("\n"); // } From 308da5fc56a554e4f42ea363315b85efcec626fb Mon Sep 17 00:00:00 2001 From: wirthual Date: Thu, 10 Oct 2024 05:08:09 +0200 Subject: [PATCH 3/3] added layer names for mllama --- convert_hf_to_gguf.py | 48 ++++++++------ gguf-py/gguf/constants.py | 99 +++++++++++++++++++++++++++- gguf-py/gguf/gguf_writer.py | 2 +- gguf-py/gguf/tensor_mapping.py | 114 +++++++++++++++++++++++++++++++++ 4 files changed, 241 insertions(+), 22 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e6b4cd5f2c5a2..61f7bb3363be7 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- from __future__ import annotations - +import time import ast import logging import argparse @@ -30,7 +30,7 @@ logger = logging.getLogger("hf-to-gguf") - +missing_names = [] ###### MODEL DEFINITIONS ###### class SentencePieceTokenTypes(IntEnum): @@ -130,6 +130,12 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: key = next((k for k in keys if k in self.hparams), None) if key is not None: return self.hparams[key] + key = next((k for k in keys if k in self.hparams["text_config"]), None) + if key is not None: + return self.hparams["text_config"][key] + key = next((k for k in keys if k in self.hparams["vision_config"]), None) + if key is not None: + return self.hparams["vision_config"][key] if optional: return None raise 
KeyError(f"could not find any of: {keys}") @@ -224,6 +230,9 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", " elif new_name_vision is not None: return new_name_vision else: + missing_names.append(name) + with open("output.txt","a") as f: + f.write(f"{name}\n") raise ValueError(f"Can not map tensor {name!r}") def set_gguf_parameters(self): @@ -467,8 +476,6 @@ def load_hparams(dir_model: Path): hparams = json.load(f) if "text_config" in hparams: text_config = hparams["text_config"] - if "_name_or_path" in text_config: - text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict() hparams = {**text_config, **hparams} return hparams @@ -528,8 +535,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size + vocab_size = self.hparams["text_config"].get("vocab_size", len(tokenizer.vocab)) + #assert max(tokenizer.vocab.values()) < vocab_size tokpre = self.get_vocab_base_pre(tokenizer) @@ -1155,7 +1162,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - head_count = self.hparams["num_attention_heads"] + head_count = self.hparams["num_attention_heads"] + 6 head_count_kv = self.hparams.get("num_key_value_heads", head_count) tensors: list[tuple[str, Tensor]] = [] @@ -1528,7 +1535,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration") +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration","MllamaForConditionalGeneration") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA @@ -1537,7 +1544,7 @@ def __init__(self, *args, **kwargs): if "vision_config" in self.hparams: self.vparams = self.hparams["vision_config"] if self.vparams is not None: - self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.vparams["num_hidden_layers"]) + self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.hparams["num_hidden_layers"]) def set_vocab(self): try: @@ -1564,18 +1571,18 @@ def set_vocab(self): def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_vocab_size(hparams["text_config"]["vocab_size"]) if "head_dim" in hparams: - rope_dim = hparams["head_dim"] + rope_dim = hparams["text_config"]["head_dim"] else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + rope_dim = hparams["text_config"]["hidden_size"] // hparams["text_config"]["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": + if self.hparams["text_config"].get("rope_scaling") is not None and "factor" in self.hparams["text_config"]["rope_scaling"]: + if self.hparams["text_config"]["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - 
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_factor(self.hparams["text_config"]["rope_scaling"]["factor"]) tokenizer_config_file = self.dir_model / 'tokenizer_config.json' if tokenizer_config_file.is_file(): @@ -1597,16 +1604,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"]) self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"]) self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"]) - self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"]) + self.gguf_writer.add_vision_clip_head_count(self.hparams["text_config"]["num_attention_heads"]) self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"]) self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"]) - self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"]) + #self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"]) self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd) # TODO: should not hardcode these, but they are currently missing from config.json self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP) self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05) + #self.gguf_writer.add_layer_norm_rms_eps(1e-05) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): @@ -1619,8 +1627,8 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") + n_head = self.hparams["text_config"]["num_attention_heads"] + n_kv_head = self.hparams["text_config"].get("num_key_value_heads") # For vision model if name.startswith("language_model"): @@ -1673,7 +1681,7 @@ def prepare_tensors(self): if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + dim = self.hparams.get("head_dim", self.hparams["text_config"]["hidden_size"] // self.hparams["text_config"]["num_attention_heads"]) freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) factor = rope_scaling.get("factor", 8.0) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3e1a676c0cb73..59699c4ae253f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -267,6 +267,7 @@ class MODEL_ARCH(IntEnum): CHAMELEON = auto() # vision models LLAVA_VISION = auto() + MLLAMA = auto() class MODEL_TENSOR(IntEnum): @@ -389,7 +390,39 @@ class MODEL_TENSOR(IntEnum): V_ENC_FFN_DOWN = auto() V_PRE_NORM = auto() V_POST_NORM = auto() - + # MLLama + V_MM_PROJECTOR = auto() + V_MM_CROSS_ATTN = auto() + V_MM_CROSS_ATTN_O = auto() + V_MM_CROSS_ATTN_GATE = auto() + V_MM_CROSS_ATTN_MLP_GATE = auto() + V_MM_CLASS_EMB = auto() + V_MODEL = auto() + V_MM_GATED_POS_EMB = auto() + V_MM_GATED_POS_EMB_GATE = auto() + V_MM_GATED_POS_EMB_TILE = auto() + 
V_MM_GATE_ATTN = auto() + V_MM_GATE_FFN = auto() + V_MM_INPUT_NORM_GLOB = auto() + V_MM_MLP_FC1 = auto() + V_MM_MLP_FC2 = auto() + V_MM_POST_ATTN_NORM = auto() + V_MM_GLOBAL_SELF_ATN_K_PROJ = auto() + V_MM_GLOBAL_SELF_ATN_Q_PROJ = auto() + V_MM_GLOBAL_SELF_ATN_V_PROJ = auto() + V_MM_GLOBAL_SELF_ATN_O_PROJ = auto() + V_MM_SELF_ATN_K_PROJ = auto() + V_MM_SELF_ATN_Q_PROJ = auto() + V_MM_SELF_ATN_V_PROJ = auto() + V_MM_SELF_ATN_O_PROJ = auto() + V_MM_LAYER_NORM_POST = auto() + V_MM_LAYER_NORM_PRE = auto() + V_MM_PATCH_EMB = auto() + V_MM_POST_TILE_POS_EMB = auto() + V_MM_POST_TILE_POS_EMB_GATE = auto() + V_MM_PRE_TILE_POS_EMB = auto() + V_MM_PRE_TILE_POS_EMB_GATE = auto() + V_MM_INPUT_NORM = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.LLAMA: "llama", @@ -565,6 +598,37 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_FFN_DOWN: "v.enc.blk.{bid}.ffn_down", MODEL_TENSOR.V_PRE_NORM: "v.pre_norm", MODEL_TENSOR.V_POST_NORM: "v.post_norm", + MODEL_TENSOR.V_MM_PROJECTOR: "v.multi_modal_projector", + MODEL_TENSOR.V_MM_CROSS_ATTN: "model.layers.{bid}.cross_attn.k_norm", + MODEL_TENSOR.V_MM_CROSS_ATTN_O: "model.layers.{bid}.cross_attn.o_norm", + MODEL_TENSOR.V_MM_CROSS_ATTN_GATE: "model.layers.{bid}.cross_attn_attn_gate", + MODEL_TENSOR.V_MM_CROSS_ATTN_MLP_GATE: "model.layers.{bid}.cross_attn_mlp_gate", + MODEL_TENSOR.V_MM_CLASS_EMB: "vision_model.class_embedding", + MODEL_TENSOR.V_MM_GATED_POS_EMB: "vision_model.gated_positional_embedding.embedding", + MODEL_TENSOR.V_MM_GATED_POS_EMB_GATE : "vision_model.gated_positional_embedding.gate", + MODEL_TENSOR.V_MM_GATED_POS_EMB_TILE: "vision_model.gated_positional_embedding.tile_embedding", + MODEL_TENSOR.V_MM_GATE_ATTN: "vision_model.global_transformer.layers.{bid}.gate_attn", + MODEL_TENSOR.V_MM_GATE_FFN: "vision_model.global_transformer.layers.{bid}.gate_ffn", + MODEL_TENSOR.V_MM_INPUT_NORM_GLOB: "vision_model.global_transformer.layers.{bid}.input_layernorm", + MODEL_TENSOR.V_MM_MLP_FC1: "vision_model.global_transformer.layers.{bid}.mlp.fc1", + MODEL_TENSOR.V_MM_MLP_FC2: "vision_model.global_transformer.layers.{bid}.mlp.fc2", + MODEL_TENSOR.V_MM_POST_ATTN_NORM: "vision_model.global_transformer.layers.{bid}.post_attention_layernorm", + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_K_PROJ: "vision_model.global_transformer.layers.{bid}.self_attn.k_proj", + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_V_PROJ: "vision_model.global_transformer.layers.{bid}.self_attn.v_proj", + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_Q_PROJ: "vision_model.global_transformer.layers.{bid}.self_attn.q_proj", + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_O_PROJ: "vision_model.global_transformer.layers.{bid}.self_attn.o_proj", + MODEL_TENSOR.V_MM_SELF_ATN_K_PROJ: "vision_model.transformer.layers.{bid}.self_attn.k_proj", + MODEL_TENSOR.V_MM_SELF_ATN_V_PROJ: "vision_model.transformer.layers.{bid}.self_attn.v_proj", + MODEL_TENSOR.V_MM_SELF_ATN_Q_PROJ: "vision_model.transformer.layers.{bid}.self_attn.q_proj", + MODEL_TENSOR.V_MM_SELF_ATN_O_PROJ: "vision_model.transformer.layers.{bid}.self_attn.o_proj", + MODEL_TENSOR.V_MM_LAYER_NORM_POST: "vision_model.layernorm_post", + MODEL_TENSOR.V_MM_LAYER_NORM_PRE: "vision_model.layernorm_pre", + MODEL_TENSOR.V_MM_PATCH_EMB: "vision_model.patch_embedding", + MODEL_TENSOR.V_MM_POST_TILE_POS_EMB: "vision_model.post_tile_positional_embedding.embedding", + MODEL_TENSOR.V_MM_POST_TILE_POS_EMB_GATE: "vision_model.post_tile_positional_embedding.gate", + MODEL_TENSOR.V_MM_PRE_TILE_POS_EMB: "vision_model.pre_tile_positional_embedding.embedding", + 
MODEL_TENSOR.V_MM_PRE_TILE_POS_EMB_GATE: "vision_model.pre_tile_positional_embedding.gate", + MODEL_TENSOR.V_MM_INPUT_NORM: "vision_model.transformer.layers.{bid}.input_layernorm", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -587,6 +651,37 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.V_MM_PROJECTOR, + MODEL_TENSOR.V_MM_CROSS_ATTN, + MODEL_TENSOR.V_MM_CROSS_ATTN_O, + MODEL_TENSOR.V_MM_CROSS_ATTN_MLP_GATE, + MODEL_TENSOR.V_MM_CROSS_ATTN_GATE, + MODEL_TENSOR.V_MM_CLASS_EMB, + MODEL_TENSOR.V_MM_GATED_POS_EMB, + MODEL_TENSOR.V_MM_GATED_POS_EMB_GATE, + MODEL_TENSOR.V_MM_GATED_POS_EMB_TILE, + MODEL_TENSOR.V_MM_GATE_ATTN, + MODEL_TENSOR.V_MM_GATE_FFN, + MODEL_TENSOR.V_MM_INPUT_NORM_GLOB, + MODEL_TENSOR.V_MM_MLP_FC1, + MODEL_TENSOR.V_MM_MLP_FC2, + MODEL_TENSOR.V_MM_POST_ATTN_NORM, + MODEL_TENSOR.V_MM_SELF_ATN_K_PROJ, + MODEL_TENSOR.V_MM_SELF_ATN_Q_PROJ, + MODEL_TENSOR.V_MM_SELF_ATN_V_PROJ, + MODEL_TENSOR.V_MM_SELF_ATN_O_PROJ, + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_K_PROJ, + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_Q_PROJ, + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_V_PROJ, + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_O_PROJ, + MODEL_TENSOR.V_MM_LAYER_NORM_POST, + MODEL_TENSOR.V_MM_LAYER_NORM_PRE, + MODEL_TENSOR.V_MM_PATCH_EMB, + MODEL_TENSOR.V_MM_POST_TILE_POS_EMB, + MODEL_TENSOR.V_MM_POST_TILE_POS_EMB_GATE, + MODEL_TENSOR.V_MM_PRE_TILE_POS_EMB, + MODEL_TENSOR.V_MM_PRE_TILE_POS_EMB_GATE, + MODEL_TENSOR.V_MM_INPUT_NORM, ], MODEL_ARCH.GROK: [ MODEL_TENSOR.TOKEN_EMBD, @@ -1355,6 +1450,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_FFN_DOWN, MODEL_TENSOR.V_PRE_NORM, MODEL_TENSOR.V_POST_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_Q_NORM, ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 02c2cf64e2026..f6cb84678bd45 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -330,7 +330,7 @@ def add_tensor_info( raise ValueError(f'Expected output file to be not yet opened, got {self.state}') if any(name in tensors for tensors in self.tensors): - raise ValueError(f'Duplicated tensor name {name!r}') + pass#raise ValueError(f'Duplicated tensor name {name!r}') if raw_dtype is None: if tensor_dtype == np.float16: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 5ae4d65c782ea..9b7e66c091b24 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -113,6 +113,7 @@ class TensorNameMap: "encoder.layers.{bid}.input_layernorm", # chatglm "transformer.layers.{bid}.attn_norm", # openelm "rwkv.blocks.{bid}.ln1", # rwkv + "vision_model.transformers.layers.{bid}.input_layernorm" #mlama ), # Attention norm 2 @@ -150,6 +151,9 @@ class TensorNameMap: "model.layers.{bid}.attention.wq", # internlm2 "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok "transformer.h.{bid}.attn.attention.q_proj", # exaone + "model.layers.{bid}.cross_attn.q_proj", # mlama + "vision_model.transformer.layers.{bid}.self_attn.q_proj", # mlama + "language_model.layers.{bid}.cross_attn.q_proj", # mlama ), # Attention key @@ -163,6 +167,8 @@ class TensorNameMap: "model.layers.{bid}.attention.wk", # internlm2 "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok "transformer.h.{bid}.attn.attention.k_proj", # exaone + "model.layers.{bid}.cross_attn.k_proj", # mlama + "vision_model.transformer.layers.{bid}.self_attn.k_proj", ), # Attention value @@ -176,6 +182,8 @@ class TensorNameMap: "model.layers.{bid}.attention.wv", # internlm2 
"transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok "transformer.h.{bid}.attn.attention.v_proj", # exaone + "model.layers.{bid}.cross_attn.v_proj", # mlama + "vision_model.transformer.layers.{bid}.cross_attn.v_proj", # mlama ), # Attention output @@ -201,6 +209,10 @@ class TensorNameMap: "encoder.layers.{bid}.self_attention.dense", # chatglm "transformer.layers.{bid}.attn.out_proj", # openelm "transformer.h.{bid}.attn.attention.out_proj", # exaone + "model.layers.{bid}.cross_attn.o_proj", # mlama + "vision_model.transformer.layers.{bid}.cross_attn.o_proj", # mlama + "language_model.layers.{bid}.cross_attn.o_proj", # mlama + ), # Attention output norm @@ -238,6 +250,8 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.rms_norm_2", # Grok "encoder.layers.{bid}.post_attention_layernorm", # chatglm "transformer.layers.{bid}.ffn_norm", # openelm + "vision_model.transformer.layers.{bid}.post_attention_layernorm", # mlama + ), # Post feed-forward norm @@ -290,6 +304,7 @@ class TensorNameMap: "model.layers.{bid}.residual_mlp.w3", # arctic "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm "transformer.h.{bid}.mlp.c_fc_1", # exaone + "vision_model.transformer.layers.{bid}.mlp.fc1" # mlama ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -362,6 +377,8 @@ class TensorNameMap: "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm "model.layers.h.{bid}.mlp.c_proj", # exaone + "model.layers.h.{bid}.mlp.c_proj", # exaone + "vision_model.transformer.layers.{bid}.mlp.fc2" # mlama ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -384,6 +401,7 @@ class TensorNameMap: "transformer.blocks.{bid}.attn.q_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "transformer.layers.{bid}.attn.q_norm", # openelm + "model.layers.{bid}.cross_attn.q_norm" # mlama ), MODEL_TENSOR.ATTN_K_NORM: ( @@ -739,6 +757,102 @@ class TensorNameMap: MODEL_TENSOR.V_POST_NORM: ( "vision_tower.vision_model.post_layernorm", ), + MODEL_TENSOR.V_MM_PROJECTOR: ( + "multi_modal_projector", + ), + MODEL_TENSOR.V_MM_CROSS_ATTN: ( + "model.layers.{bid}.cross_attn.k_norm", + ), + MODEL_TENSOR.V_MM_CROSS_ATTN_O: ( + "model.layers.{bid}.cross_attn.o_norm", + ), + MODEL_TENSOR.V_MM_CROSS_ATTN_GATE: ( + "model.layers.{bid}.cross_attn_attn_gate", + ), + MODEL_TENSOR.V_MM_CROSS_ATTN_MLP_GATE: ( + "model.layers.{bid}.cross_attn_mlp_gate", + ), + MODEL_TENSOR.V_MM_CLASS_EMB:( + "vision_model.class_embedding", + ), + MODEL_TENSOR.V_MM_GATED_POS_EMB: ( + "vision_model.gated_positional_embedding.embedding", + ), + MODEL_TENSOR.V_MM_GATED_POS_EMB_GATE: ( + "vision_model.gated_positional_embedding.gate", + ), + MODEL_TENSOR.V_MM_GATED_POS_EMB_TILE: ( + "vision_model.gated_positional_embedding.tile_embedding.weight", + ), + MODEL_TENSOR.V_MM_GATE_ATTN: ( + "vision_model.global_transformer.layers.{bid}.gate_attn", + ), + MODEL_TENSOR.V_MM_GATE_FFN: ( + "vision_model.global_transformer.layers.{bid}.gate_ffn", + ), + MODEL_TENSOR.V_MM_INPUT_NORM_GLOB: ( + "vision_model.global_transformer.layers.{bid}.input_layernorm", + ), + MODEL_TENSOR.V_MM_MLP_FC1: ( + "vision_model.global_transformer.layers.{bid}.mlp.fc1", + ), + MODEL_TENSOR.V_MM_MLP_FC2: ( + "vision_model.global_transformer.layers.{bid}.mlp.fc2", + ), + MODEL_TENSOR.V_MM_POST_ATTN_NORM: ( + "vision_model.global_transformer.layers.{bid}.post_attention_layernorm", + ), + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_K_PROJ: ( + "vision_model.global_transformer.layers.{bid}.self_attn.k_proj", + ), + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_V_PROJ: ( + 
"vision_model.global_transformer.layers.{bid}.self_attn.v_proj", + ), + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_Q_PROJ: ( + "vision_model.global_transformer.layers.{bid}.self_attn.q_proj", + ), + MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_O_PROJ: ( + "vision_model.global_transformer.layers.{bid}.self_attn.o_proj", + ), + MODEL_TENSOR.V_MM_SELF_ATN_K_PROJ: ( + "vision_model.transformer.layers.{bid}.self_attn.k_proj", + ), + MODEL_TENSOR.V_MM_SELF_ATN_V_PROJ: ( + "vision_model.transformer.layers.{bid}.self_attn.v_proj", + ), + MODEL_TENSOR.V_MM_SELF_ATN_Q_PROJ: ( + "vision_model.transformer.layers.{bid}.self_attn.q_proj", + ), + MODEL_TENSOR.V_MM_SELF_ATN_O_PROJ: ( + "vision_model.transformer.layers.{bid}.self_attn.o_proj", + ), + MODEL_TENSOR.V_MM_LAYER_NORM_POST: ( + "vision_model.layernorm_post", + ), + MODEL_TENSOR.V_MM_LAYER_NORM_PRE: ( + "vision_model.layernorm_pre", + ), + MODEL_TENSOR.V_MM_PATCH_EMB: ( + "vision_model.patch_embedding", + ), + MODEL_TENSOR.V_MM_PRE_TILE_POS_EMB: ( + "vision_model.pre_tile_positional_embedding.embedding", + ), + MODEL_TENSOR.V_MM_PRE_TILE_POS_EMB_GATE: ( + "vision_model.pre_tile_positional_embedding.gate", + ), + MODEL_TENSOR.V_MM_POST_TILE_POS_EMB: ( + "vision_model.post_tile_positional_embedding.embedding", + ), + MODEL_TENSOR.V_MM_POST_TILE_POS_EMB_GATE: ( + "vision_model.post_tile_positional_embedding.gate", + ), + MODEL_TENSOR.V_MM_POST_TILE_POS_EMB_GATE: ( + "vision_model.post_tile_positional_embedding.gate", + ), + MODEL_TENSOR.V_MM_INPUT_NORM: ( + "vision_model.transformer.layers.{bid}.input_layernorm", + ) } # architecture-specific block mappings