From 90f5cbded3f56e1852fad1a4cca8a959a9545b31 Mon Sep 17 00:00:00 2001 From: "Dong, Bo" Date: Fri, 2 Feb 2024 11:51:54 +0800 Subject: [PATCH] Support gptq with solar (#106) Co-authored-by: intellinjun <105184542+intellinjun@users.noreply.github.com> --- docs/supported_models.md | 27 ++++++++++++++++++- neural_speed/convert/common.py | 1 + neural_speed/convert/convert_llama.py | 9 +++++-- .../convert/convert_quantized_llama.py | 16 +++++------ 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/docs/supported_models.md b/docs/supported_models.md index 8b0559b0e..bf97dc628 100644 --- a/docs/supported_models.md +++ b/docs/supported_models.md @@ -43,6 +43,15 @@ Neural Speed supports the following models: ✅ ✅ Latest + + Solar-10.7B + ✅ + ✅ + ✅ + ✅ + ✅ + ✅ + Latest GPT-J-6B @@ -284,6 +293,15 @@ Neural Speed supports the following models: ✅ + + TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF + ✅ + ✅ + ✅ + ✅ + + + TheBloke/CodeLlama-7B-GGUF ✅ @@ -318,7 +336,14 @@ Neural Speed supports the following models: ✅ ✅ - + + upstage/SOLAR-10.7B-Instruct-v1.0 + ✅ + ✅ + ✅ + ✅ + ✅ + tiiuae/falcon-7 ✅ diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py index b4e49e1cb..d0fe62285 100644 --- a/neural_speed/convert/common.py +++ b/neural_speed/convert/common.py @@ -394,6 +394,7 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h # num_itr = g_idx.shape[0]//x.shape[-1] if 'desc_act' in q_config and q_config['desc_act']: g_idx = model[f"{src_name}.g_idx"] + weight = weight.reshape(-1, weight.shape[-1]) weight = (gptq_scales[g_idx.long()] * (weight - gptq_zeros[g_idx.long()])) else: infeatures = weight.shape[0] diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index eeeb02193..9dae31bd8 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -155,6 +155,7 @@ class Params: rope_scale: float bos_token_id: int eos_token_id: int + pad_token_id: int @staticmethod def guessed(model: 'LazyModel') -> 'Params': @@ -188,6 +189,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1 bos_token_id = config["bos_token_id"] eos_token_id = config["eos_token_id"] + pad_token_id = config["pad_token_id"] if "pad_token_id" in config else -1 return Params( n_vocab=n_vocab, @@ -202,6 +204,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': rope_scale=rope_scale, bos_token_id = bos_token_id, eos_token_id = eos_token_id, + pad_token_id = pad_token_id, ) # LLaMA v2 70B params.json @@ -219,6 +222,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': ffn_hidden_size = config["intermediate_size"] bos_token_id = config["bos_token_id"] eos_token_id = config["eos_token_id"] + pad_token_id = config["pad_token_id"] if "pad_token_id" in config else -1 # hack to determine LLaMA v1 vs v2 vs CodeLlama if n_vocab == -1: @@ -234,6 +238,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': ffn_hidden_size=ffn_hidden_size, bos_token_id = bos_token_id, eos_token_id = eos_token_id, + pad_token_id = pad_token_id, ) @staticmethod @@ -332,7 +337,7 @@ def __repr__(self) -> str: def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: if n_head_kv is not None and n_head != n_head_kv: - n_head //= n_head_kv + n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, 
*weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape)) @@ -1092,7 +1097,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: # but bos_token_id = 1 in llama.cpp self.fout.write(struct.pack("i", params.bos_token_id)) self.fout.write(struct.pack("i", params.eos_token_id)) - self.fout.write(struct.pack("i", -1)) + self.fout.write(struct.pack("i", params.pad_token_id)) self.fout.write(struct.pack("i", -1)) def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None: diff --git a/neural_speed/convert/convert_quantized_llama.py b/neural_speed/convert/convert_quantized_llama.py index 6d001ba39..4733d6e7f 100644 --- a/neural_speed/convert/convert_quantized_llama.py +++ b/neural_speed/convert/convert_quantized_llama.py @@ -23,7 +23,7 @@ def permute_func(weights, n_head: int, n_head_kv: int): if n_head_kv is not None and n_head != n_head_kv: - n_head //= n_head_kv + n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape)) @@ -40,12 +40,6 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head, int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config) int_weight = int_weight.view(-1,int_weight.shape[-1]) - # permute_func for llama-like model - if permute_func: - int_weight = permute_func(int_weight.t(), n_head, n_head_kv).t().contiguous() - gptq_scales = permute_func(gptq_scales.t(), n_head, n_head_kv).t().contiguous() - gptq_zeros = permute_func(gptq_zeros.t(), n_head, n_head_kv).t().contiguous() - # shuffle weight in GPTQ when act order is on if 'desc_act'in q_config and q_config['desc_act']: g_idx = model[f"{src_name}.g_idx"] @@ -63,6 +57,12 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head, int_weight2[target_idx] = int_weight[i] int_weight = int_weight2 + # permute_func for llama-like model + if permute_func: + int_weight = permute_func(int_weight.t(), n_head, n_head_kv).t().contiguous() + gptq_scales = permute_func(gptq_scales.t(), n_head, n_head_kv).t().contiguous() + gptq_zeros = permute_func(gptq_zeros.t(), n_head, n_head_kv).t().contiguous() + shape = int_weight.shape write_header(fout, shape[::-1], dst_name, GGML_QJBLAS_TYPE) @@ -123,7 +123,7 @@ def main(args_in: Optional[List[str]] = None) -> None: if "rope_scaling" in config and config["rope_scaling"] is not None: rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1 n_head = n_head - n_head_kv = n_head + n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head values = [ 1, # file version n_vocab,
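
Note on the permute change above: Solar-10.7B uses grouped-query attention, so the K projection only carries num_key_value_heads heads. The converter therefore has to deinterleave the rotary weights per KV head (n_head = n_head_kv) instead of dividing the head count, and in the GPTQ path the llama-style permute is applied only after the act-order (desc_act) shuffle so both steps see the same row layout. Below is a minimal standalone sketch of that permute using made-up toy shapes (n_head=8, n_head_kv=2, head_dim=4, hidden=16); the shapes are illustrative assumptions, not values taken from the model or the patch.

import numpy as np

def permute(weights, n_head, n_head_kv):
    # Same logic as the patched convert_llama.py: for GQA checkpoints the
    # K projection only has n_head_kv heads, so deinterleave per KV head.
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2,
                            *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))

# Toy k_proj weight: (n_head_kv * head_dim, hidden) = (8, 16), rows numbered 0..7.
n_head, n_head_kv, head_dim, hidden = 8, 2, 4, 16
k_proj = np.arange(n_head_kv * head_dim * hidden, dtype=np.float32).reshape(n_head_kv * head_dim, hidden)

out = permute(k_proj, n_head, n_head_kv)
print(out.shape)  # (8, 16): shape is preserved
# Rows are deinterleaved within each KV head: [0, 2, 1, 3, 4, 6, 5, 7].
# With the old `n_head //= n_head_kv` the reshape would use 4 heads and
# reorder rows across KV-head boundaries, producing a wrong layout for GQA.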