Enable tiny_llama (#270)
* enable tiny_llama

Signed-off-by: intellinjun <[email protected]>

* Update convert_llama.py

* Update llama.h

* Update convert_llama.py

* update ci

Signed-off-by: intellinjun <[email protected]>

* update ci

Signed-off-by: intellinjun <[email protected]>

---------

Signed-off-by: intellinjun <[email protected]>
intellinjun authored May 30, 2024
1 parent cadb01e commit af22f2a
Showing 7 changed files with 29 additions and 5 deletions.
3 changes: 2 additions & 1 deletion docs/supported_models.md
@@ -38,7 +38,8 @@ Neural Speed supports the following models:
<td>8192</td>
</tr>
<tr>
<td><a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-7B</a>,
<td><a href="https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0" target="_blank" rel="noopener noreferrer">TinyLlama-1.1B</a>,
<a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-tB</a>,
<a href="https://huggingface.co/meta-llama/Llama-2-13b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-13B</a>,
<a href="https://huggingface.co/meta-llama/Llama-2-70b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-70B</a></td>
<td>✅</td>
1 change: 1 addition & 0 deletions neural_speed/__init__.py
@@ -223,6 +223,7 @@ def init(self,

    def init_from_bin(self, model_type, model_path, **generate_kwargs):
        if self.module is None:
            model_type = model_maps.get(model_type, model_type)
            self.module = _import_package(model_type)
        self.model = self.module.Model()
        if model_type=="whisper":
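The one-line addition above routes aliased model names through model_maps before the C++ backend package is imported. The pattern is dict.get(key, key): known aliases are rewritten, and anything else passes through unchanged. A minimal sketch of the behavior — the mapping entries below are hypothetical, not the ones shipped in neural_speed:

    model_maps = {"tiny_llama": "llama", "llama2": "llama"}  # hypothetical entries

    def resolve(model_type: str) -> str:
        # Known aliases map to their backend package name; unknown names fall
        # through untouched because the key itself is the default.
        return model_maps.get(model_type, model_type)

    assert resolve("tiny_llama") == "llama"   # aliased to the llama backend
    assert resolve("whisper") == "whisper"    # passes through as-is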
10 changes: 8 additions & 2 deletions neural_speed/convert/convert_baichuan.py
@@ -144,7 +144,10 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", ftype))
fout.write(struct.pack("i", hparams["model_max_length"]))
if "max_position_embeddings" in hparams:
fout.write(struct.pack("i", hparams["max_position_embeddings"]))
else:
fout.write(struct.pack("i", hparams["model_max_length"]))
fout.write(struct.pack("f", 0))
fout.write(struct.pack("f", 0))
fout.write(struct.pack("i", 0))
@@ -248,7 +251,10 @@ def baichuan7B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", 128))
fout.write(struct.pack("i", ftype))
fout.write(struct.pack("i", hparams["model_max_length"]))
if "max_position_embeddings" in hparams:
fout.write(struct.pack("i", hparams["max_position_embeddings"]))
else:
fout.write(struct.pack("i", hparams["model_max_length"]))
fout.write(struct.pack("f", 0))
fout.write(struct.pack("f", 0))
fout.write(struct.pack("i", 0))
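Both converters now prefer max_position_embeddings over model_max_length when writing the max-sequence-length field of the binary header. A self-contained sketch of the fallback, using an in-memory buffer to show what lands in the file — the hparams values are made up:

    import io
    import struct

    def write_max_seq_len(fout, hparams):
        # Mirrors the converter's fallback: newer configs expose
        # max_position_embeddings; older Baichuan configs only ship
        # model_max_length.
        if "max_position_embeddings" in hparams:
            fout.write(struct.pack("i", hparams["max_position_embeddings"]))
        else:
            fout.write(struct.pack("i", hparams["model_max_length"]))

    buf = io.BytesIO()
    write_max_seq_len(buf, {"model_max_length": 4096})  # hypothetical old-style config
    (max_seq_len,) = struct.unpack("i", buf.getvalue())
    print(max_seq_len)  # 4096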
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_llama.py
@@ -1357,6 +1357,8 @@ def load_some_model(path: Path) -> ModelPlus:
    if path.is_dir():
        # Check if it's a set of safetensors files first
        files = list(path.glob("model-00001-of-*.safetensors"))
        if not files:
            files = list(path.glob("model*.safetensors"))  # for only one safetensor
        if not files:
            # Try the PyTorch patterns too, with lower priority
            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
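The extra glob covers checkpoints that ship as a single safetensors file rather than shards; TinyLlama-1.1B-Chat-v1.0, for example, ships one model.safetensors, which the sharded pattern model-00001-of-*.safetensors never matches. A sketch of the resulting lookup order, with the directory layout assumed:

    from pathlib import Path

    path = Path("TinyLlama-1.1B-Chat-v1.0")  # assumed to contain model.safetensors

    files = list(path.glob("model-00001-of-*.safetensors"))  # sharded checkpoints: no hit here
    if not files:
        files = list(path.glob("model*.safetensors"))  # single-file checkpoint: matches model.safetensors
    # PyTorch patterns (*.bin, *.pt) are only tried if both globs come up empty.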
7 changes: 7 additions & 0 deletions neural_speed/models/llama/llama.h
@@ -20,6 +20,7 @@

enum llama_model {
  LLAMA_UNKNOWN,
  TINY_LLAMA,
  LLAMA_7B,
  LLAMA_13B,
  LLAMA_30B,
@@ -28,6 +29,12 @@ enum llama_model {

static const model_scratch llama_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
  switch (n_layers) {
    case 22:
      return {
          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
          static_cast<unsigned long long>(scratch_size_ratio * 2048) * MB,
          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
      };
    case 32:
      return {
          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
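TinyLlama-1.1B has 22 decoder layers, which is what the new case 22: matches; it reserves the same scratch-buffer sizes as the 32-layer 7B case. A quick Python mirror of the arithmetic, for a back-of-envelope check only (the field names of model_scratch are not spelled out here):

    MB = 1024 * 1024

    def llama_scratch_bytes(n_layers, scratch_size_ratio=1.0):
        # Sketch of the new 22-layer branch of llama_mem_req.
        if n_layers == 22:  # TinyLlama-1.1B
            return (int(scratch_size_ratio * 4096) * MB,
                    int(scratch_size_ratio * 2048) * MB,
                    int(scratch_size_ratio * 4096) * MB)
        raise NotImplementedError("other layer counts follow the existing cases")

    print([b // MB for b in llama_scratch_bytes(22)])  # [4096, 2048, 4096]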
4 changes: 4 additions & 0 deletions tests/model-test/calculate_percentiles.py
@@ -37,6 +37,10 @@ def parse_output_file_acc(file_path):
    with open(file_path, 'r', encoding='UTF-8', errors='ignore') as file:
        for line in file:
            accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|", line)
            if accuracy_match:
                accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
                continue
            accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|", line)
            if accuracy_match:
                accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
                continue
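The two patterns differ only by an extra \s+ before the accuracy value, so between them they accept lm_eval result rows whether or not the numeric column is left-padded. A sketch against sample log lines — the rows below are fabricated for illustration, not real CI output:

    import re

    patterns = [
        r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|",     # value flush against the pipe
        r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|",  # value left-padded
    ]
    rows = [
        "|    |    |none    |     0|acc    |0.5840|±  |0.0069|",
        "|    |    |none    |     0|acc    |   0.5840|±  |0.0069|",
    ]
    for row in rows:
        for pat in patterns:
            m = re.search(pat, row)
            if m:
                # Same extraction as the script: first float in the match, as a percentage.
                print(float(re.search(r"\d+\.\d+", m.group()).group()) * 100)  # -> 58.4 (up to float rounding)
                break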
7 changes: 5 additions & 2 deletions tests/model-test/cpp_graph_inference.sh
@@ -146,7 +146,7 @@ model_name_map["starcoder-3b"]="bigcode/starcoder"
model_name_map["bloom-7b"]="bigscience/bloom-7b1"
model_name_map["opt-1.3b"]="facebook/opt-1.3b"
model_name_map["dolly-v2-3b"]="databricks/dolly-v2-3b"
model_name_map["chatglm3"]="THUDM/chatglm3-6b"
model_name_map["chatglm3-6b"]="THUDM/chatglm3-6b"
model_name_map["chatglm2"]="THUDM/chatglm2-6b"
model_name_map["chatglm-6b"]="THUDM/chatglm-6b"
model_name_map["baichuan2-13b"]="baichuan-inc/Baichuan2-13B-Chat"
@@ -363,6 +363,7 @@ function main() {
    ninja
    cd ..
    pip install -r $working_dir/requirements.txt
    pip install lm_eval
    python $working_dir/setup.py install
    ## prepare example requirement
    if [[ -f $requirements_file ]]; then
@@ -468,8 +469,10 @@
    chmod 777 ${WORKSPACE}/${logs_file}
    if [[ ${input} == "1024" && ${cores_per_instance} == "32" ]]; then
        echo "-------- Accuracy start--------"
        if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" ]]; then
        if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" || "${model}" == "mistral-7b" ]]; then
            OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --batch_size 8 --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
        elif [[ "${model}" == *"gptq" ]]; then
            OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --use_gptq --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
        else
            OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --tasks lambada_openai --batch_size 1 2>&1 | tee -a ${WORKSPACE}/${logs_file}
        fi
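For reference, the accuracy branch above picks cal_acc.py flags by model family: llama*, gptj-6b, and mistral-7b models run from a converted .bin with batch size 8; *gptq models load their GPTQ checkpoint directly via --use_gptq; everything else falls back to batch size 1. A Python paraphrase of that dispatch — the model name and paths are placeholders, and only the flags come from the script:

    import os
    import subprocess

    model, precision = "llama-2-7b-chat", "q4_j"   # hypothetical CI values
    model_path = f"/models/{model}"                # hypothetical layout

    if model.startswith("llama") or model in ("gptj-6b", "mistral-7b"):
        extra = ["--init_from_bin", f"{model}-{precision}.bin", "--batch_size", "8"]
    elif model.endswith("gptq"):
        extra = ["--use_gptq"]  # GPTQ checkpoints are loaded directly
    else:
        extra = ["--init_from_bin", f"{model}-{precision}.bin", "--batch_size", "1"]

    env = dict(os.environ, OMP_NUM_THREADS="56")
    subprocess.run(["numactl", "-l", "-C", "0-55", "python", "./scripts/cal_acc.py",
                    "--model_name", model_path, *extra, "--tasks", "lambada_openai"],
                   env=env, check=True)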
