From 90f5cbded3f56e1852fad1a4cca8a959a9545b31 Mon Sep 17 00:00:00 2001 From: "Dong, Bo" Date: Fri, 2 Feb 2024 11:51:54 +0800 Subject: [PATCH] Support gptq with solar (#106) Co-authored-by: intellinjun <105184542+intellinjun@users.noreply.github.com> --- docs/supported_models.md | 27 ++++++++++++++++++- neural_speed/convert/common.py | 1 + neural_speed/convert/convert_llama.py | 9 +++++-- .../convert/convert_quantized_llama.py | 16 +++++------ 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/docs/supported_models.md b/docs/supported_models.md index 8b0559b0e..bf97dc628 100644 --- a/docs/supported_models.md +++ b/docs/supported_models.md @@ -43,6 +43,15 @@ Neural Speed supports the following models: ✅ ✅ Latest + + Solar-10.7B + ✅ + ✅ + ✅ + ✅ + ✅ + ✅ + Latest GPT-J-6B @@ -284,6 +293,15 @@ Neural Speed supports the following models: ✅ + + TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF + ✅ + ✅ + ✅ + ✅ + + + TheBloke/CodeLlama-7B-GGUF ✅ @@ -318,7 +336,14 @@ Neural Speed supports the following models: ✅ ✅ - + + upstage/SOLAR-10.7B-Instruct-v1.0 + ✅ + ✅ + ✅ + ✅ + ✅ + tiiuae/falcon-7 ✅ diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py index b4e49e1cb..d0fe62285 100644 --- a/neural_speed/convert/common.py +++ b/neural_speed/convert/common.py @@ -394,6 +394,7 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h # num_itr = g_idx.shape[0]//x.shape[-1] if 'desc_act' in q_config and q_config['desc_act']: g_idx = model[f"{src_name}.g_idx"] + weight = weight.reshape(-1, weight.shape[-1]) weight = (gptq_scales[g_idx.long()] * (weight - gptq_zeros[g_idx.long()])) else: infeatures = weight.shape[0] diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index eeeb02193..9dae31bd8 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -155,6 +155,7 @@ class Params: rope_scale: float bos_token_id: int eos_token_id: int + pad_token_id: int @staticmethod def guessed(model: 'LazyModel') -> 'Params': @@ -188,6 +189,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1 bos_token_id = config["bos_token_id"] eos_token_id = config["eos_token_id"] + pad_token_id = config["pad_token_id"] if "pad_token_id" in config else -1 return Params( n_vocab=n_vocab, @@ -202,6 +204,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': rope_scale=rope_scale, bos_token_id = bos_token_id, eos_token_id = eos_token_id, + pad_token_id = pad_token_id, ) # LLaMA v2 70B params.json @@ -219,6 +222,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': ffn_hidden_size = config["intermediate_size"] bos_token_id = config["bos_token_id"] eos_token_id = config["eos_token_id"] + pad_token_id = config["pad_token_id"] if "pad_token_id" in config else -1 # hack to determine LLaMA v1 vs v2 vs CodeLlama if n_vocab == -1: @@ -234,6 +238,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': ffn_hidden_size=ffn_hidden_size, bos_token_id = bos_token_id, eos_token_id = eos_token_id, + pad_token_id = pad_token_id, ) @staticmethod @@ -332,7 +337,7 @@ def __repr__(self) -> str: def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: if n_head_kv is not None and n_head != n_head_kv: - n_head //= n_head_kv + n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, 
*weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape)) @@ -1092,7 +1097,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: # but bos_token_id = 1 in llama.cpp self.fout.write(struct.pack("i", params.bos_token_id)) self.fout.write(struct.pack("i", params.eos_token_id)) - self.fout.write(struct.pack("i", -1)) + self.fout.write(struct.pack("i", params.pad_token_id)) self.fout.write(struct.pack("i", -1)) def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None: diff --git a/neural_speed/convert/convert_quantized_llama.py b/neural_speed/convert/convert_quantized_llama.py index 6d001ba39..4733d6e7f 100644 --- a/neural_speed/convert/convert_quantized_llama.py +++ b/neural_speed/convert/convert_quantized_llama.py @@ -23,7 +23,7 @@ def permute_func(weights, n_head: int, n_head_kv: int): if n_head_kv is not None and n_head != n_head_kv: - n_head //= n_head_kv + n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape)) @@ -40,12 +40,6 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head, int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config) int_weight = int_weight.view(-1,int_weight.shape[-1]) - # permute_func for llama-like model - if permute_func: - int_weight = permute_func(int_weight.t(), n_head, n_head_kv).t().contiguous() - gptq_scales = permute_func(gptq_scales.t(), n_head, n_head_kv).t().contiguous() - gptq_zeros = permute_func(gptq_zeros.t(), n_head, n_head_kv).t().contiguous() - # shuffle weight in GPTQ when act order is on if 'desc_act'in q_config and q_config['desc_act']: g_idx = model[f"{src_name}.g_idx"] @@ -63,6 +57,12 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head, int_weight2[target_idx] = int_weight[i] int_weight = int_weight2 + # permute_func for llama-like model + if permute_func: + int_weight = permute_func(int_weight.t(), n_head, n_head_kv).t().contiguous() + gptq_scales = permute_func(gptq_scales.t(), n_head, n_head_kv).t().contiguous() + gptq_zeros = permute_func(gptq_zeros.t(), n_head, n_head_kv).t().contiguous() + shape = int_weight.shape write_header(fout, shape[::-1], dst_name, GGML_QJBLAS_TYPE) @@ -123,7 +123,7 @@ def main(args_in: Optional[List[str]] = None) -> None: if "rope_scaling" in config and config["rope_scaling"] is not None: rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1 n_head = n_head - n_head_kv = n_head + n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head values = [ 1, # file version n_vocab,
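
Note on the permute change above: Solar-10.7B uses grouped-query attention, so the K projection only carries num_key_value_heads heads. The converter therefore has to deinterleave the rotary weights per KV head (n_head = n_head_kv) instead of dividing the head count, and in the GPTQ path the llama-style permute is applied only after the act-order (desc_act) shuffle so both steps see the same row layout. Below is a minimal standalone sketch of that permute using made-up toy shapes (n_head=8, n_head_kv=2, head_dim=4, hidden=16); the shapes are illustrative assumptions, not values taken from the model or the patch.

import numpy as np

def permute(weights, n_head, n_head_kv):
    # Same logic as the patched convert_llama.py: for GQA checkpoints the
    # K projection only has n_head_kv heads, so deinterleave per KV head.
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2,
                            *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))

# Toy k_proj weight: (n_head_kv * head_dim, hidden) = (8, 16), rows numbered 0..7.
n_head, n_head_kv, head_dim, hidden = 8, 2, 4, 16
k_proj = np.arange(n_head_kv * head_dim * hidden, dtype=np.float32).reshape(n_head_kv * head_dim, hidden)

out = permute(k_proj, n_head, n_head_kv)
print(out.shape)  # (8, 16): shape is preserved
# Rows are deinterleaved within each KV head: [0, 2, 1, 3, 4, 6, 5, 7].
# With the old `n_head //= n_head_kv` the reshape would use 4 heads and
# reorder rows across KV-head boundaries, producing a wrong layout for GQA.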