From 3c3f1cc14d1bb1daa69f750a99e8f2b22fa9abf7 Mon Sep 17 00:00:00 2001
From: zhenwei-intel
Date: Sun, 4 Feb 2024 14:07:14 +0800
Subject: [PATCH] update

Signed-off-by: zhenwei-intel
---
 neural_speed/convert/common.py                  | 6 +++---
 neural_speed/convert/convert_quantized_gptj.py  | 2 +-
 neural_speed/convert/convert_quantized_llama.py | 2 +-
 scripts/convert.py                              | 3 ++-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py
index 8c74426ef..3054b43e7 100644
--- a/neural_speed/convert/common.py
+++ b/neural_speed/convert/common.py
@@ -354,7 +354,7 @@ def convert_q4_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head2
     # gptq_scale = torch.cat([gptq_scale,gptq_scale,gptq_scale,gptq_scale], dim=1).view(-1,1)
     pack_tensor = torch.cat((gptq_scale.half().view(torch.int8), tensor), dim=-1)
     pack_tensor.numpy().tofile(fout)
-    print(f"converting {dst_name} qauntized tensor to ggml q4 block")
+    print(f"converting {dst_name} quantized tensor to ggml q4 block")
 
 def convert_q4_1_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head2=0, permute_func=None):
     qzeros = model[f"{src_name}.qzeros"]
@@ -381,7 +381,7 @@ def convert_q4_1_tensor(src_name, dst_name, model, fout, q_config, n_head, n_hea
     gptq_zeros = -gptq_scale*gptq_zeros
     pack_tensor = torch.cat((gptq_scale.half().view(torch.int8), gptq_zeros.half().view(torch.int8), tensor), dim=-1)
     pack_tensor.numpy().tofile(fout)
-    print(f"converting {dst_name} qauntized tensor to ggml q4 1 block")
+    print(f"converting {dst_name} quantized tensor to ggml q4 1 block")
 
 
 def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head_kv=0, permute_func=None):
@@ -411,4 +411,4 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h
     write_header(fout, shape, dst_name, 0)
     weight.numpy().tofile(fout)
 
-    print(f"converting {dst_name} qauntized tensor to fp32 tensor")
+    print(f"converting {dst_name} quantized tensor to fp32 tensor")
diff --git a/neural_speed/convert/convert_quantized_gptj.py b/neural_speed/convert/convert_quantized_gptj.py
index 829445707..cac8c346a 100644
--- a/neural_speed/convert/convert_quantized_gptj.py
+++ b/neural_speed/convert/convert_quantized_gptj.py
@@ -96,7 +96,7 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config):
                                                 alg="sym" if q_config['sym'] else "asym",
                                                 compute_dtype="int8")
     dst.flatten()[:byte_size].tofile(fout)
-    print(f"converting {dst_name} qauntized tensor to bestla q4 block")
+    print(f"converting {dst_name} quantized tensor to bestla q4 block")
 
 
 def main(args_in: Optional[List[str]] = None) -> None:
diff --git a/neural_speed/convert/convert_quantized_llama.py b/neural_speed/convert/convert_quantized_llama.py
index 4733d6e7f..e00503dba 100644
--- a/neural_speed/convert/convert_quantized_llama.py
+++ b/neural_speed/convert/convert_quantized_llama.py
@@ -92,7 +92,7 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
                                                 alg="sym" if q_config['sym'] else "asym",
                                                 compute_dtype="int8")
     dst.flatten()[:byte_size].tofile(fout)
-    print(f"converting {dst_name} qauntized tensor to bestla q4 block")
+    print(f"converting {dst_name} quantized tensor to bestla q4 block")
 
 def main(args_in: Optional[List[str]] = None) -> None:
     parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
diff --git a/scripts/convert.py b/scripts/convert.py
index d0012d51a..a5694ccec 100644
--- a/scripts/convert.py
+++ b/scripts/convert.py
@@ -27,6 +27,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     )
     parser.add_argument("--outfile", type=Path, required=True, help="path to write to")
    parser.add_argument("model", type=Path, help="directory containing model file or model id")
+    parser.add_argument("--use_quantized_model", action="store_true", help="use quantized model: awq/gptq/autoround")
     args = parser.parse_args(args_in)
 
     if args.model.exists():
@@ -34,7 +35,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         dir_model = args.model
 
-    convert_model(dir_model, args.outfile, args.outtype)
+    convert_model(dir_model, args.outfile, args.outtype, use_quantized_model=args.use_quantized_model)
 
 
 if __name__ == "__main__":
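
A usage sketch for the new flag (illustrative: the output file name, model path, and --outtype value are placeholders; --outtype is an existing option that this patch only forwards via args.outtype):

    python scripts/convert.py --outfile ne_model_f32.bin --outtype f32 --use_quantized_model /path/to/awq_or_gptq_model

Because the argument uses action="store_true", use_quantized_model defaults to False, so existing invocations of scripts/convert.py keep their current behavior; passing the flag forwards use_quantized_model=True into convert_model(), which is expected to select the quantized conversion paths (e.g. convert_quantized_gptj.py, convert_quantized_llama.py) touched above.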