diff --git a/.gitignore b/.gitignore
index cf1b692e9c27c..59839cc8b386e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,7 @@
 build*/
 out/
 tmp/
+local/
 
 models/*
 models-mnt
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 203eaf64b3fc3..32cad0987c80d 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -10,7 +10,7 @@
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Optional, cast
 
 import numpy as np
 import torch
@@ -22,9 +22,9 @@
 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
-
 ###### MODEL DEFINITIONS ######
 
+
 class SentencePieceTokenTypes(IntEnum):
     NORMAL = 1
     UNKNOWN = 2
@@ -185,7 +185,7 @@ def from_model_architecture(model_architecture):
         if model_architecture == "GPT2LMHeadModel":
             return GPT2Model
         if model_architecture == "PhiForCausalLM":
-            return Phi2Model
+            return PhiModel
         if model_architecture == "PlamoForCausalLM":
             return PlamoModel
         return Model
@@ -230,7 +230,7 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
         if arch == "GPT2LMHeadModel":
             return gguf.MODEL_ARCH.GPT2
         if arch == "PhiForCausalLM":
-            return gguf.MODEL_ARCH.PHI2
+            return gguf.MODEL_ARCH.PHI
         if arch == "PlamoForCausalLM":
             return gguf.MODEL_ARCH.PLAMO
 
@@ -1059,11 +1059,11 @@ def write_tensors(self):
         self.gguf_writer.add_tensor("output.weight", data)
 
 
-class Phi2Model(Model):
+class PhiModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layer"]
 
-        self.gguf_writer.add_name("Phi2")
+        self.gguf_writer.add_name("Phi")
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index f0a1c51f8dbe8..751aff63b4b3c 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -97,7 +97,7 @@ class MODEL_ARCH(IntEnum):
     BLOOM = auto()
     STABLELM = auto()
     QWEN = auto()
-    PHI2 = auto()
+    PHI = auto()
     PLAMO = auto()
 
 
@@ -145,7 +145,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.BLOOM: "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
     MODEL_ARCH.QWEN: "qwen",
-    MODEL_ARCH.PHI2: "phi2",
+    MODEL_ARCH.PHI: "phi",
     MODEL_ARCH.PLAMO: "plamo",
 }
 
@@ -383,7 +383,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
-    MODEL_ARCH.PHI2: [
+    MODEL_ARCH.PHI: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 80c1d5449cc74..22df04b987a79 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -17,8 +17,8 @@ class TensorNameMap:
             "tok_embeddings",  # llama-pth
             "embeddings.word_embeddings",  # bert
             "language_model.embedding.word_embeddings",  # persimmon
+            "transformer.embd.wte",  # phi1 phi1_5 phi2
             "wte",  # gpt2
-            "transformer.embd.wte",  # phi2
         ),
 
         # Token type embeddings
@@ -44,7 +44,7 @@ class TensorNameMap:
             "lm_head",  # gpt2 mpt falcon llama-hf baichuan qwen
             "output",  # llama-pth bloom
             "word_embeddings_for_head",  # persimmon
-            "lm_head.linear",  # phi2
+            "lm_head.linear",  # phi1 phi1_5 phi2
         ),
 
         # Output norm
@@ -57,7 +57,7 @@ class TensorNameMap:
             "transformer.norm_f",  # mpt
             "ln_f",  # refact bloom qwen gpt2
             "language_model.encoder.final_layernorm",  # persimmon
-            "lm_head.ln",  # phi2
+            "lm_head.ln",  # phi1 phi1_5 phi2
         ),
 
         # Rope frequencies
@@ -80,8 +80,8 @@ class TensorNameMap:
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
             "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
             "model.layers.{bid}.ln1",  # yi
+            "transformer.h.{bid}.ln",  # phi1 phi1_5 phi2
             "h.{bid}.ln_1",  # gpt2
-            "transformer.h.{bid}.ln",  # phi2
             "model.layers.layers.{bid}.norm",  # plamo
         ),
 
@@ -98,8 +98,8 @@ class TensorNameMap:
             "transformer.h.{bid}.self_attention.query_key_value",  # falcon
             "h.{bid}.self_attention.query_key_value",  # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
+            "transformer.h.{bid}.mixer.Wqkv",  # phi1 phi1_5 phi2
             "h.{bid}.attn.c_attn",  # gpt2
-            "transformer.h.{bid}.mixer.Wqkv",  # phi2
         ),
 
         # Attention query
@@ -141,8 +141,8 @@ class TensorNameMap:
             "encoder.layer.{bid}.attention.output.dense",  # bert
             "transformer.h.{bid}.attn.out_proj",  # gpt-j
             "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
+            "transformer.h.{bid}.mixer.out_proj",  # phi1 phi1_5 phi2
             "h.{bid}.attn.c_proj",  # gpt2
-            "transformer.h.{bid}.mixer.out_proj",  # phi2
             "model.layers.layers.{bid}.self_attn.o_proj",  # plamo
         ),
 
@@ -185,8 +185,8 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.fc_in",  # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
             "transformer.h.{bid}.mlp.w1",  # qwen
+            "transformer.h.{bid}.mlp.fc1",  # phi1 phi1_5 phi2
             "h.{bid}.mlp.c_fc",  # gpt2
-            "transformer.h.{bid}.mlp.fc1",  # phi2
             "model.layers.layers.{bid}.mlp.up_proj",  # plamo
         ),
 
@@ -225,8 +225,8 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.dense",  # bert
             "transformer.h.{bid}.mlp.fc_out",  # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
+            "transformer.h.{bid}.mlp.fc2",  # phi1 phi1_5 phi2
             "h.{bid}.mlp.c_proj",  # gpt2
-            "transformer.h.{bid}.mlp.fc2",  # phi2
             "model.layers.layers.{bid}.mlp.down_proj",  # plamo
         ),
 
diff --git a/llama.cpp b/llama.cpp
index 8e0717db92702..7874030cef962 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -197,7 +197,7 @@ enum llm_arch {
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
-    LLM_ARCH_PHI2,
+    LLM_ARCH_PHI,
     LLM_ARCH_PLAMO,
     LLM_ARCH_UNKNOWN,
 };
@@ -216,7 +216,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
-    { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI, "phi" },
     { LLM_ARCH_PLAMO, "plamo" },
 };
 
@@ -572,7 +572,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
         },
     },
     {
-        LLM_ARCH_PHI2,
+        LLM_ARCH_PHI,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
             { LLM_TENSOR_OUTPUT, "output" },
@@ -2824,11 +2824,12 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -3715,7 +3716,7 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
                     }
                 } break;
-            case LLM_ARCH_PHI2:
+            case LLM_ARCH_PHI:
                 {
                     model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
@@ -4329,7 +4330,7 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);
 
-    if (model.arch == LLM_ARCH_PHI2) {
+    if (model.arch == LLM_ARCH_PHI) {
         // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
         // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -5741,7 +5742,7 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_phi2() {
+    struct ggml_cgraph * build_phi() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -6570,9 +6571,9 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen();
             } break;
-        case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI:
             {
-                result = llm.build_phi2();
+                result = llm.build_phi();
             } break;
         case LLM_ARCH_PLAMO:
             {
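The short Python sketch below is not part of the patch; it only illustrates how the renamed architecture would be consumed from gguf-py once the diff is applied. get_tensor_name_map(MODEL_ARCH.PHI, ...) resolves the Hugging Face tensor names registered above for phi1/phi1_5/phi2 to their GGUF names, which is the same lookup convert-hf-to-gguf.py performs in write_tensors(). The block count and example tensor names are illustrative, not taken from the patch.

# Sketch only, assuming this patch is applied (MODEL_ARCH.PHI replaces MODEL_ARCH.PHI2).
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

# phi-1/phi-1.5 have 24 blocks (the MODEL_1B case added above); phi-2 has 32.
name_map = get_tensor_name_map(MODEL_ARCH.PHI, 24)

for hf_name in (
    "transformer.embd.wte.weight",        # token embeddings
    "transformer.h.0.mixer.Wqkv.weight",  # fused QKV projection of block 0
    "lm_head.linear.weight",              # output head
):
    # try_suffixes lets the lookup strip ".weight"/".bias" before matching
    gguf_name = name_map.get_name(hf_name, try_suffixes=(".weight", ".bias"))
    print(f"{hf_name} -> {gguf_name}")

# Expected output:
#   transformer.embd.wte.weight -> token_embd.weight
#   transformer.h.0.mixer.Wqkv.weight -> blk.0.attn_qkv.weight
#   lm_head.linear.weight -> output.weight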