
Support gptq with solar (#106)
Co-authored-by: intellinjun <[email protected]>
a32543254 and intellinjun authored Feb 2, 2024
1 parent 6f85518 commit 90f5cbd
Showing 4 changed files with 42 additions and 11 deletions.
27 changes: 26 additions & 1 deletion docs/supported_models.md
@@ -43,6 +43,15 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
</tr>
<tr>
<td><a href="https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0" target="_blank" rel="noopener noreferrer">Solar-10.7B</a></td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td>Latest</td>
</tr>
<tr>
<td><a href="https://huggingface.co/EleutherAI/gpt-j-6b" target="_blank" rel="noopener noreferrer">GPT-J-6B</a></td>
@@ -284,6 +293,15 @@ Neural Speed supports the following models:
<td>✅</td>
<td></td>
</tr>
<tr>
<td><a href="https://huggingface.co/TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF" target="_blank" rel="noopener noreferrer">TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF</a></td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td></td>
</tr>
<tr>
<td><a href="https://huggingface.co/codellama/CodeLlama-7b-hf" target="_blank" rel="noopener noreferrer">TheBloke/CodeLlama-7B-GGUF</a></td>
<td>✅</td>
@@ -318,7 +336,14 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
</tr>
</tr>
<tr>
<td><a href="https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0" target="_blank" rel="noopener noreferrer">upstage/SOLAR-10.7B-Instruct-v1.0</a></td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
</tr>
<tr>
<td><a href="https://huggingface.co/tiiuae/falcon-7b/tree/main" target="_blank" rel="noopener noreferrer">tiiuae/falcon-7b</a></td>
<td>✅</td>
1 change: 1 addition & 0 deletions neural_speed/convert/common.py
@@ -394,6 +394,7 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h
# num_itr = g_idx.shape[0]//x.shape[-1]
if 'desc_act' in q_config and q_config['desc_act']:
g_idx = model[f"{src_name}.g_idx"]
weight = weight.reshape(-1, weight.shape[-1])
weight = (gptq_scales[g_idx.long()] * (weight - gptq_zeros[g_idx.long()]))
else:
infeatures = weight.shape[0]
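For context on the one-line change above: with GPTQ's desc_act (activation-order) mode, g_idx maps each input row of the unpacked weight to its quantization group, so the tensor has to be flattened to (in_features, out_features) before the per-row scale/zero lookup. A minimal sketch of that dequantization step, using hypothetical tensor names and torch (the converter already operates on torch tensors):

import torch

def dequant_gptq_desc_act(weight, gptq_scales, gptq_zeros, g_idx):
    # weight:      unpacked integer weights, flattened to (in_features, out_features)
    # gptq_scales: (n_groups, out_features) per-group scales
    # gptq_zeros:  (n_groups, out_features) per-group zero points
    # g_idx:       (in_features,) quantization group of each input row
    weight = weight.reshape(-1, weight.shape[-1])        # the reshape added above
    # Row-wise lookup: every input row uses its own group's scale and zero point.
    return gptq_scales[g_idx.long()] * (weight - gptq_zeros[g_idx.long()])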
9 changes: 7 additions & 2 deletions neural_speed/convert/convert_llama.py
@@ -155,6 +155,7 @@ class Params:
rope_scale: float
bos_token_id: int
eos_token_id: int
pad_token_id: int

@staticmethod
def guessed(model: 'LazyModel') -> 'Params':
@@ -188,6 +189,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1
bos_token_id = config["bos_token_id"]
eos_token_id = config["eos_token_id"]
pad_token_id = config["pad_token_id"] if "pad_token_id" in config else -1

return Params(
n_vocab=n_vocab,
@@ -202,6 +204,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
rope_scale=rope_scale,
bos_token_id = bos_token_id,
eos_token_id = eos_token_id,
pad_token_id = pad_token_id,
)

# LLaMA v2 70B params.json
@@ -219,6 +222,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params':
ffn_hidden_size = config["intermediate_size"]
bos_token_id = config["bos_token_id"]
eos_token_id = config["eos_token_id"]
pad_token_id = config["pad_token_id"] if "pad_token_id" in config else -1
# hack to determine LLaMA v1 vs v2 vs CodeLlama

if n_vocab == -1:
@@ -234,6 +238,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params':
ffn_hidden_size=ffn_hidden_size,
bos_token_id = bos_token_id,
eos_token_id = eos_token_id,
pad_token_id = pad_token_id,
)

@staticmethod
@@ -332,7 +337,7 @@ def __repr__(self) -> str:

def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
if n_head_kv is not None and n_head != n_head_kv:
n_head //= n_head_kv
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2,
*weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))
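A note on the permute() fix above: for grouped-query-attention checkpoints such as SOLAR-10.7B, the K projection carries only n_head_kv heads, so the reshape must group rows by the KV head count itself rather than by the ratio n_head // n_head_kv. A small self-contained sketch (head counts and sizes are illustrative assumptions, not read from the model):

import numpy as np

def permute(weights, n_head, n_head_kv=None):
    # Group rows by the number of heads actually present in this projection,
    # then interleave the two rotary halves of each head.
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2,
                            *weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))

# Illustrative GQA shapes: hidden 4096, head_dim 128, 32 query heads, 8 KV heads.
wk = np.zeros((8 * 128, 4096), dtype=np.float32)   # K projection weight
assert permute(wk, n_head=32, n_head_kv=8).shape == wk.shape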

@@ -1092,7 +1097,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
# but bos_token_id = 1 in llama.cpp
self.fout.write(struct.pack("i", params.bos_token_id))
self.fout.write(struct.pack("i", params.eos_token_id))
self.fout.write(struct.pack("i", -1))
self.fout.write(struct.pack("i", params.pad_token_id))
self.fout.write(struct.pack("i", -1))

def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
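Taken together, the convert_llama.py changes thread pad_token_id from the Hugging Face config into the header slot that was previously hard-coded to -1, defaulting to -1 when the config has no pad token. A minimal sketch of that flow; the output path and example config values are assumptions:

import struct

config = {"bos_token_id": 1, "eos_token_id": 2}   # e.g. a config.json with no pad_token_id
pad_token_id = config["pad_token_id"] if "pad_token_id" in config else -1

with open("ne_header.bin", "wb") as fout:         # hypothetical output file
    fout.write(struct.pack("i", config["bos_token_id"]))
    fout.write(struct.pack("i", config["eos_token_id"]))
    fout.write(struct.pack("i", pad_token_id))    # was always struct.pack("i", -1)
    fout.write(struct.pack("i", -1))              # the remaining slot stays hard-coded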
16 changes: 8 additions & 8 deletions neural_speed/convert/convert_quantized_llama.py
@@ -23,7 +23,7 @@

def permute_func(weights, n_head: int, n_head_kv: int):
if n_head_kv is not None and n_head != n_head_kv:
n_head //= n_head_kv
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2,
*weights.shape[1:]).swapaxes(1, 2).reshape(weights.shape))

@@ -40,12 +40,6 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
int_weight, gptq_scales, gptq_zeros = unpack_weight(qweight, scales, qzeros, q_config)
int_weight = int_weight.view(-1,int_weight.shape[-1])

# permute_func for llama-like model
if permute_func:
int_weight = permute_func(int_weight.t(), n_head, n_head_kv).t().contiguous()
gptq_scales = permute_func(gptq_scales.t(), n_head, n_head_kv).t().contiguous()
gptq_zeros = permute_func(gptq_zeros.t(), n_head, n_head_kv).t().contiguous()

# shuffle weight in GPTQ when act order is on
if 'desc_act' in q_config and q_config['desc_act']:
g_idx = model[f"{src_name}.g_idx"]
@@ -63,6 +57,12 @@
int_weight2[target_idx] = int_weight[i]
int_weight = int_weight2

# permute_func for llama-like model
if permute_func:
int_weight = permute_func(int_weight.t(), n_head, n_head_kv).t().contiguous()
gptq_scales = permute_func(gptq_scales.t(), n_head, n_head_kv).t().contiguous()
gptq_zeros = permute_func(gptq_zeros.t(), n_head, n_head_kv).t().contiguous()

shape = int_weight.shape
write_header(fout, shape[::-1], dst_name, GGML_QJBLAS_TYPE)
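The reordering in this hunk (the permute block now runs after the desc_act shuffle) matters because the two row reorderings do not commute: rows must be back in canonical order before the head permutation is applied. A toy demonstration with made-up shapes and a simple stand-in for the g_idx reorder:

import numpy as np

def head_permute(w, n_head):
    # Same interleaving as permute_func, with n_head already equal to the KV head count.
    return (w.reshape(n_head, 2, w.shape[0] // n_head // 2,
                      *w.shape[1:]).swapaxes(1, 2).reshape(w.shape))

rows, cols, n_head = 8, 3, 2
w = np.arange(rows * cols).reshape(rows, cols)
unshuffle = np.array([1, 2, 3, 4, 5, 6, 7, 0])     # stand-in for the g_idx row reorder

new_order = head_permute(w[unshuffle], n_head)     # fixed: un-shuffle rows, then permute heads
old_order = head_permute(w, n_head)[unshuffle]     # before: permute heads, then un-shuffle rows
print(np.array_equal(new_order, old_order))        # False: the order of the two steps matters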

@@ -123,7 +123,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
if "rope_scaling" in config and config["rope_scaling"] is not None:
rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1
n_head = n_head
n_head_kv = n_head
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
values = [
1, # file version
n_vocab,
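Finally, the change to main() lets GQA checkpoints report their real KV head count instead of assuming n_head_kv == n_head. A minimal sketch of reading those fields from a Hugging Face config.json; the path and example values (SOLAR-10.7B uses 32 query heads and 8 KV heads) are assumptions for illustration:

import json

with open("config.json") as f:                     # hypothetical path to the model's config
    config = json.load(f)

n_head = config["num_attention_heads"]             # e.g. 32
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head   # e.g. 8
rope_scale = 1
if "rope_scaling" in config and config["rope_scaling"] is not None:
    rope_scale = config["rope_scaling"].get("factor", 1)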
