diff --git a/mergekit/_data/architectures/exaone.json b/mergekit/_data/architectures/exaone.json
new file mode 100644
index 00000000..e9024473
--- /dev/null
+++ b/mergekit/_data/architectures/exaone.json
@@ -0,0 +1,78 @@
+{
+    "model_type": "exaone",
+    "architectures": [
+        "ExaoneForCausalLM"
+    ],
+    "pre_weights": [
+        {
+            "name": "transformer.wte.weight",
+            "is_embed": true,
+            "output_space": "running_residual"
+        }
+    ],
+    "num_layers_config_key": "num_hidden_layers",
+    "layer_templates": {
+        "weights": [
+            {
+                "name": "transformer.h.${layer_index}.ln_1.weight",
+                "input_space": "running_residual"
+            },
+            {
+                "name": "transformer.h.${layer_index}.attn.attention.q_proj.weight",
+                "input_space": "running_residual",
+                "output_space": "attn_qk_${layer_index}",
+                "head_split": "output",
+                "is_kq": true
+            },
+            {
+                "name": "transformer.h.${layer_index}.attn.attention.k_proj.weight",
+                "input_space": "running_residual",
+                "output_space": "attn_qk_${layer_index}",
+                "head_split": "output",
+                "is_kq": true
+            },
+            {
+                "name": "transformer.h.${layer_index}.attn.attention.v_proj.weight",
+                "input_space": "running_residual",
+                "output_space": "attn_v_${layer_index}",
+                "head_split": "output"
+            },
+            {
+                "name": "transformer.h.${layer_index}.attn.attention.out_proj.weight",
+                "input_space": "attn_v_${layer_index}",
+                "output_space": "running_residual",
+                "head_split": "input"
+            },
+            {
+                "name": "transformer.h.${layer_index}.ln_2.weight",
+                "input_space": "running_residual"
+            },
+            {
+                "name": "transformer.h.${layer_index}.mlp.c_fc_0.weight",
+                "input_space": "running_residual",
+                "output_space": "up_${layer_index}"
+            },
+            {
+                "name": "transformer.h.${layer_index}.mlp.c_fc_1.weight",
+                "input_space": "running_residual",
+                "output_space": "up_${layer_index}"
+            },
+            {
+                "name": "transformer.h.${layer_index}.mlp.c_proj.weight",
+                "input_space": "up_${layer_index}",
+                "output_space": "running_residual"
+            }
+        ]
+    },
+    "post_weights": [
+        {
+            "name": "transformer.ln_f.weight",
+            "input_space": "running_residual"
+        },
+        {
+            "name": "lm_head.weight",
+            "input_space": "running_residual",
+            "is_embed": true
+        }
+    ]
+}
diff --git a/mergekit/_data/architectures/solar.json b/mergekit/_data/architectures/solar.json
new file mode 100644
index 00000000..7bd6a751
--- /dev/null
+++ b/mergekit/_data/architectures/solar.json
@@ -0,0 +1,81 @@
+{
+    "model_type": "solar",
+    "architectures": [
+        "SolarForCausalLM"
+    ],
+    "pre_weights": [
+        {
+            "name": "model.embed_tokens.weight",
+            "is_embed": true,
+            "output_space": "running_residual"
+        }
+    ],
+    "num_layers_config_key": "num_hidden_layers",
+    "layer_templates": {
+        "weights": [
+            {
+                "name": "model.layers.${layer_index}.input_layernorm.weight",
+                "input_space": "running_residual"
+            },
+            {
+                "name": "model.layers.${layer_index}.self_attn.k_proj.weight",
+                "input_space": "running_residual",
+                "output_space": "attn_qk_${layer_index}",
+                "head_split": "output",
+                "is_kq": true
+            },
+            {
+                "name": "model.layers.${layer_index}.self_attn.q_proj.weight",
+                "input_space": "running_residual",
+                "output_space": "attn_qk_${layer_index}",
+                "head_split": "output",
+                "is_kq": true
+            },
+            {
+                "name": "model.layers.${layer_index}.self_attn.v_proj.weight",
+                "input_space": "running_residual",
+                "output_space": "attn_v_${layer_index}",
+                "head_split": "output"
+            },
+            {
+                "name": "model.layers.${layer_index}.self_attn.o_proj.weight",
+                "input_space": "attn_v_${layer_index}",
+                "output_space": "running_residual",
+                "head_split": "input"
+            },
+            {
+                "name": "model.layers.${layer_index}.post_attention_layernorm.weight",
+                "input_space": "running_residual"
+            },
+            {
+                "name": "model.layers.${layer_index}.mlp.gate_proj.weight",
+                "input_space": "running_residual",
+                "output_space": "up_${layer_index}"
+            },
+            {
+                "name": "model.layers.${layer_index}.mlp.up_proj.weight",
+                "input_space": "running_residual",
+                "output_space": "up_${layer_index}"
+            },
+            {
+                "name": "model.layers.${layer_index}.mlp.down_proj.weight",
+                "input_space": "up_${layer_index}",
+                "output_space": "running_residual"
+            }
+        ]
+    },
+    "post_weights": [
+        {
+            "name": "model.norm.weight",
+            "input_space": "running_residual"
+        },
+        {
+            "name": "lm_head.weight",
+            "input_space": "running_residual",
+            "is_embed": true,
+            "aliases": [
+                "model.lm_head.weight"
+            ]
+        }
+    ]
+}
diff --git a/mergekit/_data/chat_templates/exaone.jinja b/mergekit/_data/chat_templates/exaone.jinja
new file mode 100644
index 00000000..3a4d07ae
--- /dev/null
+++ b/mergekit/_data/chat_templates/exaone.jinja
@@ -0,0 +1,19 @@
+{#-
+  EXAONE chat template. Every tag carries Jinja whitespace control ("-") so
+  the line breaks and indentation used to lay out this file are never
+  rendered; the only text emitted is the string literals and message fields.
+-#}
+{%- for message in messages -%}
+    {%- if loop.first and message['role'] != 'system' -%}
+        {{- '[|system|][|endofturn|]\n' -}}
+    {%- endif -%}
+    {{- '[|' + message['role'] + '|]' + message['content'] -}}
+    {%- if message['role'] == 'user' -%}
+        {{- '\n' -}}
+    {%- else -%}
+        {{- '[|endofturn|]\n' -}}
+    {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{- '[|assistant|]' -}}
+{%- endif -%}