
Commit

update
Signed-off-by: zhenwei-intel <[email protected]>
zhenwei-intel committed Feb 4, 2024
1 parent cf5e06e commit 3c3f1cc
Showing 4 changed files with 7 additions and 6 deletions.
6 changes: 3 additions & 3 deletions neural_speed/convert/common.py
@@ -354,7 +354,7 @@ def convert_q4_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head2
# gptq_scale = torch.cat([gptq_scale,gptq_scale,gptq_scale,gptq_scale], dim=1).view(-1,1)
pack_tensor = torch.cat((gptq_scale.half().view(torch.int8), tensor), dim=-1)
pack_tensor.numpy().tofile(fout)
print(f"converting {dst_name} qauntized tensor to ggml q4 block")
print(f"converting {dst_name} quantized tensor to ggml q4 block")

def convert_q4_1_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head2=0, permute_func=None):
qzeros = model[f"{src_name}.qzeros"]
@@ -381,7 +381,7 @@ def convert_q4_1_tensor(src_name, dst_name, model, fout, q_config, n_head, n_hea
gptq_zeros = -gptq_scale*gptq_zeros
pack_tensor = torch.cat((gptq_scale.half().view(torch.int8), gptq_zeros.half().view(torch.int8), tensor), dim=-1)
pack_tensor.numpy().tofile(fout)
print(f"converting {dst_name} qauntized tensor to ggml q4 1 block")
print(f"converting {dst_name} quantized tensor to ggml q4 1 block")


def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_head_kv=0, permute_func=None):
@@ -411,4 +411,4 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h
write_header(fout, shape, dst_name, 0)
weight.numpy().tofile(fout)

print(f"converting {dst_name} qauntized tensor to fp32 tensor")
print(f"converting {dst_name} quantized tensor to fp32 tensor")
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_quantized_gptj.py
@@ -96,7 +96,7 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config)
alg="sym" if q_config['sym'] else "asym",
compute_dtype="int8")
dst.flatten()[:byte_size].tofile(fout)
print(f"converting {dst_name} qauntized tensor to bestla q4 block")
print(f"converting {dst_name} quantized tensor to bestla q4 block")


def main(args_in: Optional[List[str]] = None) -> None:
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_quantized_llama.py
@@ -92,7 +92,7 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
alg="sym" if q_config['sym'] else "asym",
compute_dtype="int8")
dst.flatten()[:byte_size].tofile(fout)
print(f"converting {dst_name} qauntized tensor to bestla q4 block")
print(f"converting {dst_name} quantized tensor to bestla q4 block")

def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
3 changes: 2 additions & 1 deletion scripts/convert.py
@@ -27,14 +27,15 @@ def main(args_in: Optional[List[str]] = None) -> None:
)
parser.add_argument("--outfile", type=Path, required=True, help="path to write to")
parser.add_argument("model", type=Path, help="directory containing model file or model id")
parser.add_argument("--use_quantized_model", action="store_true", help="use quantized model: awq/gptq/autoround")
args = parser.parse_args(args_in)

if args.model.exists():
dir_model = args.model.as_posix()
else:
dir_model = args.model

- convert_model(dir_model, args.outfile, args.outtype)
+ convert_model(dir_model, args.outfile, args.outtype, use_quantized_model=args.use_quantized_model)


if __name__ == "__main__":
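
For reference, a hypothetical invocation of the updated script with the new flag; the model path is a placeholder, and the parser's other arguments (such as the one backing args.outtype) are assumed to keep their defaults:

python scripts/convert.py --outfile ne-model.bin --use_quantized_model /path/to/gptq-model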
