diff --git a/docs/supported_models.md b/docs/supported_models.md
index 8b0559b0e..bf97dc628 100644
--- a/docs/supported_models.md
+++ b/docs/supported_models.md
@@ -43,6 +43,15 @@ Neural Speed supports the following models:
     <td>✅</td>
     <td>✅</td>
     <td>Latest</td>
   </tr>
+  <tr>
+    <td>Solar-10.7B</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td>Latest</td>
+  </tr>
   <tr>
     <td>GPT-J-6B</td>
@@ -284,6 +293,15 @@ Neural Speed supports the following models:
     <td>✅</td>
     <td> </td>
   </tr>
+  <tr>
+    <td>TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td> </td>
+  </tr>
+
   <tr>
     <td>TheBloke/CodeLlama-7B-GGUF</td>
     <td>✅</td>
@@ -318,7 +336,14 @@ Neural Speed supports the following models:
     <td>✅</td>
     <td>✅</td>
-  </tr>
+  </tr>
+  <tr>
+    <td>upstage/SOLAR-10.7B-Instruct-v1.0</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td>✅</td>
+    <td>✅</td>
+  </tr>
   <tr>
     <td>tiiuae/falcon-7</td>
     <td>✅</td>
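The new table rows advertise INT8/INT4 support for SOLAR-10.7B alongside the other Llama-family entries. As orientation (not part of this patch), here is a generation sketch in the style of the project README, assuming the intel-extension-for-transformers front end that drives Neural Speed; the prompt and token budget are placeholders:

```python
# Sketch only: load the newly listed SOLAR checkpoint with 4-bit weights via the
# intel-extension-for-transformers front end that drives Neural Speed.
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "upstage/SOLAR-10.7B-Instruct-v1.0"  # Hugging Face id from the table above
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```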
diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py
index b4e49e1cb..d0fe62285 100644
--- a/neural_speed/convert/common.py
+++ b/neural_speed/convert/common.py
@@ -394,6 +394,7 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h
     # num_itr = g_idx.shape[0]//x.shape[-1]
     if 'desc_act' in q_config and q_config['desc_act']:
         g_idx = model[f"{src_name}.g_idx"]
+        weight = weight.reshape(-1, weight.shape[-1])
         weight = (gptq_scales[g_idx.long()] * (weight - gptq_zeros[g_idx.long()]))
     else:
         infeatures = weight.shape[0]
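This one-line fix matters because, on the `desc_act` path, `g_idx` maps every input channel (row) of the weight to its quantization group, so `gptq_scales[g_idx]` and `gptq_zeros[g_idx]` come out with one entry per row; flattening the weight to `(in_features, out_features)` first makes that broadcast line up. A toy shape check follows; the sizes and the 3-D layout of `weight` before the reshape are assumptions for illustration, not taken from the converter:

```python
import torch

in_features, out_features, group_size = 8, 4, 2
n_groups = in_features // group_size

# assumed pre-reshape layout: one block of rows per quantization group
weight = torch.randint(0, 16, (n_groups, group_size, out_features)).float()
gptq_scales = torch.rand(n_groups, out_features)
gptq_zeros = torch.randint(0, 16, (n_groups, out_features)).float()
g_idx = torch.arange(in_features) // group_size      # group index of each input channel

weight = weight.reshape(-1, weight.shape[-1])        # -> (in_features, out_features)
dequant = gptq_scales[g_idx.long()] * (weight - gptq_zeros[g_idx.long()])
print(dequant.shape)                                 # torch.Size([8, 4])
```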
diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index eeeb02193..9dae31bd8 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -155,6 +155,7 @@ class Params:
     rope_scale: float
     bos_token_id: int
     eos_token_id: int
+    pad_token_id: int
 
     @staticmethod
     def guessed(model: 'LazyModel') -> 'Params':
@@ -188,6 +189,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
            rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1
        bos_token_id = config["bos_token_id"]
        eos_token_id = config["eos_token_id"]
+       pad_token_id = config["pad_token_id"] if "pad_token_id" in config else -1
 
        return Params(
            n_vocab=n_vocab,
@@ -202,6 +204,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
            rope_scale=rope_scale,
            bos_token_id = bos_token_id,
            eos_token_id = eos_token_id,
+           pad_token_id = pad_token_id,
        )
 
    # LLaMA v2 70B params.json
@@ -219,6 +222,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params':
        ffn_hidden_size = config["intermediate_size"]
        bos_token_id = config["bos_token_id"]
        eos_token_id = config["eos_token_id"]
+       pad_token_id = config["pad_token_id"] if "pad_token_id" in config else -1
 
        # hack to determine LLaMA v1 vs v2 vs CodeLlama
        if n_vocab == -1:
@@ -234,6 +238,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params':
            ffn_hidden_size=ffn_hidden_size,
            bos_token_id = bos_token_id,
            eos_token_id = eos_token_id,
+           pad_token_id = pad_token_id,
        )
 
    @staticmethod
@@ -332,7 +337,7 @@ def __repr__(self) -> str:
 
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
     if n_head_kv is not None and n_head != n_head_kv:
-        n_head //= n_head_kv
+        n_head = n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2,
                             *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))
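The `n_head = n_head_kv` change is what makes this permutation correct for grouped-query-attention checkpoints such as SOLAR-10.7B: the K projection only holds `n_head_kv` heads, so the row swap has to be grouped per KV head. With the old `n_head //= n_head_kv`, rows from different KV heads ended up mixed into one block. A small self-contained check; the 32/8 head counts and the tiny `head_dim` are illustrative values:

```python
import numpy as np

def permute(weights, n_head, n_head_kv):
    # mirrors the fixed function above
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2,
                            *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))

n_head, n_head_kv, head_dim = 32, 8, 4    # SOLAR-like: 32 query heads, 8 KV heads
wk = np.arange(n_head_kv * head_dim * 6, dtype=np.float32).reshape(n_head_kv * head_dim, 6)

out = permute(wk, n_head, n_head_kv)
print(out[:head_dim, 0])   # [ 0. 12.  6. 18.] -- rows stay inside KV head 0, halves interleaved
```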
@@ -1092,7 +1097,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
         # but bos_token_id = 1 in llama.cpp
         self.fout.write(struct.pack("i", params.bos_token_id))
         self.fout.write(struct.pack("i", params.eos_token_id))
-        self.fout.write(struct.pack("i", -1))
+        self.fout.write(struct.pack("i", params.pad_token_id))
         self.fout.write(struct.pack("i", -1))
 
     def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
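With `pad_token_id` carried through `Params`, the converter now records the checkpoint's real padding token instead of a hard-coded `-1`; `-1` remains the "not set" sentinel for the following field. The header entries are plain packed 32-bit integers, as in this standalone sketch; the example ids and file name are made up:

```python
import struct

bos_token_id, eos_token_id, pad_token_id = 1, 2, 2   # example ids only
with open("ne_header_fragment.bin", "wb") as fout:
    fout.write(struct.pack("i", bos_token_id))
    fout.write(struct.pack("i", eos_token_id))
    fout.write(struct.pack("i", pad_token_id))       # previously always packed as -1
    fout.write(struct.pack("i", -1))                 # still-unset field keeps the -1 sentinel
```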
diff --git a/neural_speed/convert/convert_quantized_llama.py b/neural_speed/convert/convert_quantized_llama.py
index 6d001ba39..4733d6e7f 100644
--- a/neural_speed/convert/convert_quantized_llama.py
+++ b/neural_speed/convert/convert_quantized_llama.py
@@ -23,7 +23,7 @@
 
 def permute_func(weights, n_head: int, n_head_kv: int):
     if n_head_kv is not None and n_head != n_head_kv:
-        n_head //= n_head_kv
+        n_head = n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2,
                             *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))
 
@@ -40,12 +40,6 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
     int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config)
     int_weight = int_weight.view(-1,int_weight.shape[-1])
 
-    # permute_func for llama-like model
-    if permute_func:
-        int_weight = permute_func(int_weight.t(), n_head, n_head_kv).t().contiguous()
-        gptq_scales = permute_func(gptq_scales.t(), n_head, n_head_kv).t().contiguous()
-        gptq_zeros = permute_func(gptq_zeros.t(), n_head, n_head_kv).t().contiguous()
-
     # shuffle weight in GPTQ when act order is on
     if 'desc_act'in q_config and q_config['desc_act']:
         g_idx = model[f"{src_name}.g_idx"]
@@ -63,6 +57,12 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
             int_weight2[target_idx] = int_weight[i]
         int_weight = int_weight2
 
+    # permute_func for llama-like model
+    if permute_func:
+        int_weight = permute_func(int_weight.t(), n_head, n_head_kv).t().contiguous()
+        gptq_scales = permute_func(gptq_scales.t(), n_head, n_head_kv).t().contiguous()
+        gptq_zeros = permute_func(gptq_zeros.t(), n_head, n_head_kv).t().contiguous()
+
     shape = int_weight.shape
     write_header(fout, shape[::-1], dst_name, GGML_QJBLAS_TYPE)
@@ -123,7 +123,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     if "rope_scaling" in config and config["rope_scaling"] is not None:
         rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1
     n_head = n_head
-    n_head_kv = n_head
+    n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
     values = [
         1, # file version
         n_vocab,
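Reading `num_key_value_heads` (falling back to `n_head` when it is absent) is what feeds the GQA-aware `permute_func` above the correct KV head count for SOLAR-like checkpoints. A minimal sketch of that lookup, assuming a locally downloaded Hugging Face checkpoint; the path is hypothetical, and older Llama configs omit the key, which is exactly why the fallback exists:

```python
import json

# hypothetical local checkout of the model being converted
with open("SOLAR-10.7B-Instruct-v1.0/config.json") as f:
    config = json.load(f)

n_head = config["num_attention_heads"]
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
pad_token_id = config["pad_token_id"] if "pad_token_id" in config else -1
print(n_head, n_head_kv, pad_token_id)   # the two head counts differ for GQA models (e.g. 32 vs 8)
```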