From bc31b47f85a0bb6143e709ec18def512a6da1f3b Mon Sep 17 00:00:00 2001
From: intellinjun
Date: Tue, 28 May 2024 17:30:08 +0800
Subject: [PATCH 1/6] enable tiny_llama

Signed-off-by: intellinjun
---
 docs/supported_models.md              | 3 ++-
 neural_speed/convert/convert_llama.py | 2 +-
 neural_speed/models/llama/llama.h     | 7 +++++++
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/docs/supported_models.md b/docs/supported_models.md
index 115bc9955..23e69657f 100644
--- a/docs/supported_models.md
+++ b/docs/supported_models.md
@@ -38,7 +38,8 @@ Neural Speed supports the following models:
         8192
-        LLaMA2-7B,
+        TinyLlama-1.1B,
+        LLaMA2-7B,
         LLaMA2-13B,
         LLaMA2-70B
         ✅
diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index 25f307216..e7a08e732 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -1356,7 +1356,7 @@ def load_some_model(path: Path) -> ModelPlus:
     # Be extra-friendly and accept either a file or a directory:
     if path.is_dir():
         # Check if it's a set of safetensors files first
-        files = list(path.glob("model-00001-of-*.safetensors"))
+        files = list(path.glob("model*.safetensors"))
         if not files:
             # Try the PyTorch patterns too, with lower priority
             globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
diff --git a/neural_speed/models/llama/llama.h b/neural_speed/models/llama/llama.h
index 99fb65a72..02d44df39 100644
--- a/neural_speed/models/llama/llama.h
+++ b/neural_speed/models/llama/llama.h
@@ -20,6 +20,7 @@
 enum llama_model {
   LLAMA_UNKNOWN,
+  Tiny_llama,
   LLAMA_7B,
   LLAMA_13B,
   LLAMA_30B,
@@ -28,6 +29,12 @@
 static const model_scratch llama_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
   switch (n_layers) {
+    case 22:
+      return {
+          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+          static_cast<unsigned long long>(scratch_size_ratio * 2048) * MB,
+          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+      };
     case 32:
       return {
           static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,

From 0ae3fd65cd4066230ea16858fa6a4970815fe7ff Mon Sep 17 00:00:00 2001
From: intellinjun <105184542+intellinjun@users.noreply.github.com>
Date: Tue, 28 May 2024 18:19:28 +0800
Subject: [PATCH 2/6] Update convert_llama.py

---
 neural_speed/convert/convert_llama.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index e7a08e732..9866618e4 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -1356,7 +1356,9 @@ def load_some_model(path: Path) -> ModelPlus:
     # Be extra-friendly and accept either a file or a directory:
     if path.is_dir():
         # Check if it's a set of safetensors files first
-        files = list(path.glob("model*.safetensors"))
+        files = list(path.glob("model-00001-of-*.safetensors"))
+        if not files
+            files = list(path.glob("model*.safetensors")) # for only one safetensor
         if not files:
             # Try the PyTorch patterns too, with lower priority
             globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]

From b84772800c4d80942491ccd82f8ce6b8c0142578 Mon Sep 17 00:00:00 2001
From: intellinjun <105184542+intellinjun@users.noreply.github.com>
Date: Tue, 28 May 2024 18:21:01 +0800
Subject: [PATCH 3/6] Update llama.h

---
 neural_speed/models/llama/llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_speed/models/llama/llama.h b/neural_speed/models/llama/llama.h
index 02d44df39..e3e5d5637 100644
--- a/neural_speed/models/llama/llama.h
+++ b/neural_speed/models/llama/llama.h
@@ -20,7 +20,7 @@
 enum llama_model {
   LLAMA_UNKNOWN,
-  Tiny_llama,
+  TINY_LLAMA,
   LLAMA_7B,
   LLAMA_13B,
   LLAMA_30B,
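Note on PATCH 1/6: llama_mem_req dispatches purely on the layer count, and TinyLlama-1.1B has 22 hidden layers, so the new "case 22:" is what routes TinyLlama to its scratch buffers. Below is a minimal Python sketch of that lookup using only the sizes visible in the patch; MB, SCRATCH_MB, and scratch_for are illustrative names, not repo code:

    # Illustrative mirror of the `case 22:` branch added in PATCH 1/6.
    MB = 1024 * 1024  # the C++ header multiplies each size by an MB constant

    # n_layers -> the three scratch-buffer sizes (in MB) from the patch
    SCRATCH_MB = {
        22: (4096, 2048, 4096),  # TinyLlama-1.1B
    }

    def scratch_for(n_layers: int, scratch_size_ratio: float = 1.0) -> tuple:
        # Scale each per-model size by the ratio, then convert MB -> bytes,
        # matching llama_mem_req's static_cast<...>(ratio * size) * MB.
        return tuple(int(scratch_size_ratio * s) * MB for s in SCRATCH_MB[n_layers])

    print(scratch_for(22))  # (4294967296, 2147483648, 4294967296)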
From 235cd03452754180dfe465a9894276a17d962dc7 Mon Sep 17 00:00:00 2001
From: intellinjun <105184542+intellinjun@users.noreply.github.com>
Date: Tue, 28 May 2024 19:37:51 +0800
Subject: [PATCH 4/6] Update convert_llama.py

---
 neural_speed/convert/convert_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index 9866618e4..e579b0b8b 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -1357,7 +1357,7 @@ def load_some_model(path: Path) -> ModelPlus:
     if path.is_dir():
         # Check if it's a set of safetensors files first
         files = list(path.glob("model-00001-of-*.safetensors"))
-        if not files
+        if not files:
             files = list(path.glob("model*.safetensors")) # for only one safetensor
         if not files:
             # Try the PyTorch patterns too, with lower priority

From 03a2ab55639fa5b6c59e714eab5cde548f464b5d Mon Sep 17 00:00:00 2001
From: intellinjun
Date: Wed, 29 May 2024 11:28:35 +0800
Subject: [PATCH 5/6] update ci

Signed-off-by: intellinjun
---
 neural_speed/__init__.py                  | 1 +
 tests/model-test/calculate_percentiles.py | 4 ++++
 tests/model-test/cpp_graph_inference.sh   | 4 +++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py
index 2b98a4fbb..7e7dc3750 100644
--- a/neural_speed/__init__.py
+++ b/neural_speed/__init__.py
@@ -223,6 +223,7 @@ def init(self,

     def init_from_bin(self, model_type, model_path, **generate_kwargs):
         if self.module is None:
+            model_type = model_maps.get(model_type, model_type)
             self.module = _import_package(model_type)
         self.model = self.module.Model()
         if model_type=="whisper":
diff --git a/tests/model-test/calculate_percentiles.py b/tests/model-test/calculate_percentiles.py
index f2f32e2f6..752a4715c 100644
--- a/tests/model-test/calculate_percentiles.py
+++ b/tests/model-test/calculate_percentiles.py
@@ -37,6 +37,10 @@ def parse_output_file_acc(file_path):
     with open(file_path, 'r', encoding='UTF-8', errors='ignore') as file:
         for line in file:
             accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|", line)
+            if accuracy_match:
+                accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
+                continue
+            accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|", line)
             if accuracy_match:
                 accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
                 continue
diff --git a/tests/model-test/cpp_graph_inference.sh b/tests/model-test/cpp_graph_inference.sh
index afbd46188..63b7e3a8b 100644
--- a/tests/model-test/cpp_graph_inference.sh
+++ b/tests/model-test/cpp_graph_inference.sh
@@ -468,8 +468,10 @@ function main() {
             chmod 777 ${WORKSPACE}/${logs_file}
             if [[ ${input} == "1024" && ${cores_per_instance} == "32" ]]; then
                 echo "-------- Accuracy start--------"
-                if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" ]]; then
+                if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" || "${model}" == "mistral-7b" ]]; then
                     OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --batch_size 8 --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
+                elif [[ "${model}" == *"gptq" ]]; then
+                    OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --use_gptq --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
                 else
                     OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --tasks lambada_openai --batch_size 1 2>&1 | tee -a ${WORKSPACE}/${logs_file}
                 fi
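Note on PATCH 2/6 and PATCH 4/6 (the latter fixes the missing colon the former introduced): together they settle load_some_model on a two-step safetensors probe, trying the sharded "model-00001-of-*.safetensors" name first and only then the broad "model*.safetensors" glob, so single-file checkpoints such as TinyLlama's lone model.safetensors are found before any PyTorch pattern is tried. A standalone sketch of the resolved lookup order, assuming a plain loop in place of the script's actual handling of globs:

    from pathlib import Path

    def find_checkpoint_files(path: Path) -> list:
        # Sharded safetensors first; matching only shard 00001 keeps the
        # broader pattern below from returning every shard at once.
        files = list(path.glob("model-00001-of-*.safetensors"))
        if not files:
            # Single-file checkpoints ship as just "model.safetensors".
            files = list(path.glob("model*.safetensors"))
        if not files:
            # PyTorch patterns, with lower priority (same order as the script).
            for pattern in ("consolidated.00.pth", "pytorch_model-00001-of-*.bin",
                            "*.pt", "pytorch_model.bin"):
                files = list(path.glob(pattern))
                if files:
                    break
        return files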
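Note on PATCH 5/6's change to init_from_bin: model_maps.get(model_type, model_type) canonicalizes the model-type alias before _import_package resolves it, and is a no-op for names that are already canonical. The mapping below is invented for illustration; the real model_maps dict is defined elsewhere in neural_speed/__init__.py:

    # Invented aliases for illustration; see model_maps in neural_speed/__init__.py.
    model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}

    for model_type in ("gpt_neox", "llama"):
        # Aliases are rewritten; unknown keys pass through unchanged.
        print(model_maps.get(model_type, model_type))  # gptneox, then llama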
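Note on the near-duplicate regex PATCH 5/6 adds to parse_output_file_acc: it differs from the first pattern only by a \s+ before the accuracy value, which appears intended to accept lm-eval table output that left-pads the acc column. A quick check against two invented sample lines (the assumption being that both spacings occur in real logs):

    import re

    PATTERNS = (
        r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|",     # unpadded value
        r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|",  # padded value
    )

    SAMPLES = (
        "|  |  |none  |  0|acc  |0.7500|±  |0.0063|",
        "|  |  |none  |  0|acc  |  0.7500|±  |0.0063|",
    )

    for line in SAMPLES:
        for pattern in PATTERNS:
            match = re.search(pattern, line)
            if match:
                # Same extraction as the script: first float in the match, as a percent.
                print(float(re.search(r"\d+\.\d+", match.group()).group()) * 100)
                break  # prints 75.0 for each sample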
From 39e004525437982f0a158e276673da6b1428ac2c Mon Sep 17 00:00:00 2001
From: intellinjun
Date: Wed, 29 May 2024 17:04:44 +0800
Subject: [PATCH 6/6] update ci

Signed-off-by: intellinjun
---
 neural_speed/convert/convert_baichuan.py | 10 ++++++++--
 tests/model-test/cpp_graph_inference.sh  |  3 ++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py
index fea0641a9..62595d996 100644
--- a/neural_speed/convert/convert_baichuan.py
+++ b/neural_speed/convert/convert_baichuan.py
@@ -144,7 +144,10 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["num_hidden_layers"]))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", ftype))
-    fout.write(struct.pack("i", hparams["model_max_length"]))
+    if "max_position_embeddings" in hparams:
+        fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+    else:
+        fout.write(struct.pack("i", hparams["model_max_length"]))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("i", 0))
@@ -248,7 +251,10 @@ def baichuan7B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["num_hidden_layers"]))
     fout.write(struct.pack("i", 128))
     fout.write(struct.pack("i", ftype))
-    fout.write(struct.pack("i", hparams["model_max_length"]))
+    if "max_position_embeddings" in hparams:
+        fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+    else:
+        fout.write(struct.pack("i", hparams["model_max_length"]))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("i", 0))
diff --git a/tests/model-test/cpp_graph_inference.sh b/tests/model-test/cpp_graph_inference.sh
index 63b7e3a8b..b8886fa39 100644
--- a/tests/model-test/cpp_graph_inference.sh
+++ b/tests/model-test/cpp_graph_inference.sh
@@ -146,7 +146,7 @@ model_name_map["starcoder-3b"]="bigcode/starcoder"
 model_name_map["bloom-7b"]="bigscience/bloom-7b1"
 model_name_map["opt-1.3b"]="facebook/opt-1.3b"
 model_name_map["dolly-v2-3b"]="databricks/dolly-v2-3b"
-model_name_map["chatglm3"]="THUDM/chatglm3-6b"
+model_name_map["chatglm3-6b"]="THUDM/chatglm3-6b"
 model_name_map["chatglm2"]="THUDM/chatglm2-6b"
 model_name_map["chatglm-6b"]="THUDM/chatglm-6b"
 model_name_map["baichuan2-13b"]="baichuan-inc/Baichuan2-13B-Chat"
@@ -363,6 +363,7 @@ function main() {
         ninja
         cd ..
         pip install -r $working_dir/requirements.txt
+        pip install lm_eval
         python $working_dir/setup.py install
         ## prepare example requirement
         if [[ -f $requirements_file ]]; then
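Note on PATCH 6/6's converter change: max_position_embeddings is the standard Hugging Face config key for the context window, while model_max_length is a tokenizer-side field that some Baichuan checkpoints carry instead, so the converter now prefers the former and falls back to the latter. A self-contained sketch of the packing pattern (the sample hparams are invented):

    import io
    import struct

    def pack_context_length(fout, hparams: dict) -> None:
        # Write the context window as an int32, preferring the standard key.
        if "max_position_embeddings" in hparams:
            fout.write(struct.pack("i", hparams["max_position_embeddings"]))
        else:
            fout.write(struct.pack("i", hparams["model_max_length"]))

    # A config that only carries model_max_length still converts.
    buf = io.BytesIO()
    pack_context_length(buf, {"model_max_length": 4096})
    print(struct.unpack("i", buf.getvalue())[0])  # 4096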