From 3c3f1cc14d1bb1daa69f750a99e8f2b22fa9abf7 Mon Sep 17 00:00:00 2001
From: zhenwei-intel
Date: Sun, 4 Feb 2024 14:07:14 +0800
Subject: [PATCH] update

Signed-off-by: zhenwei-intel
---
 neural_speed/convert/common.py                  | 6 +++---
 neural_speed/convert/convert_quantized_gptj.py  | 2 +-
 neural_speed/convert/convert_quantized_llama.py | 2 +-
 scripts/convert.py                              | 3 ++-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py
index 8c74426ef..3054b43e7 100644
--- a/neural_speed/convert/common.py
+++ b/neural_speed/convert/common.py
@@ -354,7 +354,7 @@ def convert_q4_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head2
     # gptq_scale = torch.cat([gptq_scale,gptq_scale,gptq_scale,gptq_scale], dim=1).view(-1,1)
     pack_tensor = torch.cat((gptq_scale.half().view(torch.int8), tensor), dim=-1)
     pack_tensor.numpy().tofile(fout)
-    print(f"converting {dst_name} qauntized tensor to ggml q4 block")
+    print(f"converting {dst_name} quantized tensor to ggml q4 block")
 
 def convert_q4_1_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head2=0, permute_func=None):
     qzeros = model[f"{src_name}.qzeros"]
@@ -381,7 +381,7 @@ def convert_q4_1_tensor(src_name, dst_name, model, fout, q_config, n_head, n_hea
     gptq_zeros = -gptq_scale*gptq_zeros
     pack_tensor = torch.cat((gptq_scale.half().view(torch.int8), gptq_zeros.half().view(torch.int8), tensor), dim=-1)
     pack_tensor.numpy().tofile(fout)
-    print(f"converting {dst_name} qauntized tensor to ggml q4 1 block")
+    print(f"converting {dst_name} quantized tensor to ggml q4 1 block")
 
 
 def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head_kv=0, permute_func=None):
@@ -411,4 +411,4 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h
     write_header(fout, shape, dst_name, 0)
     weight.numpy().tofile(fout)
 
-    print(f"converting {dst_name} qauntized tensor to fp32 tensor")
+    print(f"converting {dst_name} quantized tensor to fp32 tensor")
diff --git a/neural_speed/convert/convert_quantized_gptj.py b/neural_speed/convert/convert_quantized_gptj.py
index 829445707..cac8c346a 100644
--- a/neural_speed/convert/convert_quantized_gptj.py
+++ b/neural_speed/convert/convert_quantized_gptj.py
@@ -96,7 +96,7 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config):
                                                 alg="sym" if q_config['sym'] else "asym",
                                                 compute_dtype="int8")
     dst.flatten()[:byte_size].tofile(fout)
-    print(f"converting {dst_name} qauntized tensor to bestla q4 block")
+    print(f"converting {dst_name} quantized tensor to bestla q4 block")
 
 
 def main(args_in: Optional[List[str]] = None) -> None:
diff --git a/neural_speed/convert/convert_quantized_llama.py b/neural_speed/convert/convert_quantized_llama.py
index 4733d6e7f..e00503dba 100644
--- a/neural_speed/convert/convert_quantized_llama.py
+++ b/neural_speed/convert/convert_quantized_llama.py
@@ -92,7 +92,7 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
                                                 alg="sym" if q_config['sym'] else "asym",
                                                 compute_dtype="int8")
     dst.flatten()[:byte_size].tofile(fout)
-    print(f"converting {dst_name} qauntized tensor to bestla q4 block")
+    print(f"converting {dst_name} quantized tensor to bestla q4 block")
 
 def main(args_in: Optional[List[str]] = None) -> None:
     parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
diff --git a/scripts/convert.py b/scripts/convert.py
index d0012d51a..a5694ccec 100644
--- a/scripts/convert.py
+++ b/scripts/convert.py
@@ -27,6 +27,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     )
     parser.add_argument("--outfile", type=Path, required=True, help="path to write to")
    parser.add_argument("model", type=Path, help="directory containing model file or model id")
+    parser.add_argument("--use_quantized_model", action="store_true", help="use quantized model: awq/gptq/autoround")
     args = parser.parse_args(args_in)
 
     if args.model.exists():
@@ -34,7 +35,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     else:
         dir_model = args.model
 
-    convert_model(dir_model, args.outfile, args.outtype)
+    convert_model(dir_model, args.outfile, args.outtype, use_quantized_model=args.use_quantized_model)
 
 
 if __name__ == "__main__":
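
A usage sketch for the new flag (illustrative: the output file name, model path, and --outtype value are placeholders; --outtype is an existing option that this patch only forwards via args.outtype):

    python scripts/convert.py --outfile ne_model_f32.bin --outtype f32 --use_quantized_model /path/to/awq_or_gptq_model

Because the argument uses action="store_true", use_quantized_model defaults to False, so existing invocations of scripts/convert.py keep their current behavior; passing the flag forwards use_quantized_model=True into convert_model(), which is expected to select the quantized conversion paths (e.g. convert_quantized_gptj.py, convert_quantized_llama.py) touched above.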