diff --git a/docs/supported_models.md b/docs/supported_models.md
index 115bc9955..23e69657f 100644
--- a/docs/supported_models.md
+++ b/docs/supported_models.md
@@ -38,7 +38,8 @@ Neural Speed supports the following models:
     8192
-    LLaMA2-7B,
+    TinyLlama-1.1B,
+    LLaMA2-7B,
     LLaMA2-13B,
     LLaMA2-70B
     ✅
diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py
index 2b98a4fbb..7e7dc3750 100644
--- a/neural_speed/__init__.py
+++ b/neural_speed/__init__.py
@@ -223,6 +223,7 @@ def init(self,
     def init_from_bin(self, model_type, model_path, **generate_kwargs):
         if self.module is None:
+            model_type = model_maps.get(model_type, model_type)
             self.module = _import_package(model_type)
         self.model = self.module.Model()
         if model_type=="whisper":
diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py
index fea0641a9..62595d996 100644
--- a/neural_speed/convert/convert_baichuan.py
+++ b/neural_speed/convert/convert_baichuan.py
@@ -144,7 +144,10 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["num_hidden_layers"]))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", ftype))
-    fout.write(struct.pack("i", hparams["model_max_length"]))
+    if "max_position_embeddings" in hparams:
+        fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+    else:
+        fout.write(struct.pack("i", hparams["model_max_length"]))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("i", 0))
@@ -248,7 +251,10 @@ def baichuan7B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["num_hidden_layers"]))
     fout.write(struct.pack("i", 128))
     fout.write(struct.pack("i", ftype))
-    fout.write(struct.pack("i", hparams["model_max_length"]))
+    if "max_position_embeddings" in hparams:
+        fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+    else:
+        fout.write(struct.pack("i", hparams["model_max_length"]))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("i", 0))
diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index 25f307216..e579b0b8b 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -1357,6 +1357,8 @@ def load_some_model(path: Path) -> ModelPlus:
     if path.is_dir():
         # Check if it's a set of safetensors files first
        files = list(path.glob("model-00001-of-*.safetensors"))
+        if not files:
+            files = list(path.glob("model*.safetensors"))  # for a single safetensors file
         if not files:
             # Try the PyTorch patterns too, with lower priority
             globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
diff --git a/neural_speed/models/llama/llama.h b/neural_speed/models/llama/llama.h
index 99fb65a72..e3e5d5637 100644
--- a/neural_speed/models/llama/llama.h
+++ b/neural_speed/models/llama/llama.h
@@ -20,6 +20,7 @@ enum llama_model {
   LLAMA_UNKNOWN,
+  TINY_LLAMA,
   LLAMA_7B,
   LLAMA_13B,
   LLAMA_30B,
@@ -28,6 +29,12 @@
 static const model_scratch llama_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
   switch (n_layers) {
+    case 22:
+      return {
+          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+          static_cast<unsigned long long>(scratch_size_ratio * 2048) * MB,
+          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+      };
     case 32:
       return {
           static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
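The convert_llama.py hunk above exists because models such as TinyLlama-1.1B ship as a single `model.safetensors` file rather than a sharded `model-00001-of-*.safetensors` set. A minimal standalone sketch of the resulting lookup order follows; the `find_checkpoint_files` helper name is hypothetical and not part of the patch:

```python
from pathlib import Path

def find_checkpoint_files(path: Path) -> list:
    # Mirrors the lookup order after this patch (illustrative only).
    # Sharded safetensors checkpoints take priority,
    files = list(path.glob("model-00001-of-*.safetensors"))
    if not files:
        # then a single-file checkpoint such as model.safetensors,
        files = list(path.glob("model*.safetensors"))
    if not files:
        # and finally the PyTorch patterns, with lower priority.
        for pattern in ("consolidated.00.pth", "pytorch_model-00001-of-*.bin",
                        "*.pt", "pytorch_model.bin"):
            files = list(path.glob(pattern))
            if files:
                break
    return files
```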
diff --git a/tests/model-test/calculate_percentiles.py b/tests/model-test/calculate_percentiles.py
index f2f32e2f6..752a4715c 100644
--- a/tests/model-test/calculate_percentiles.py
+++ b/tests/model-test/calculate_percentiles.py
@@ -37,6 +37,10 @@ def parse_output_file_acc(file_path):
     with open(file_path, 'r', encoding='UTF-8', errors='ignore') as file:
         for line in file:
             accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|", line)
+            if accuracy_match:
+                accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
+                continue
+            accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|", line)
             if accuracy_match:
                 accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
                 continue
diff --git a/tests/model-test/cpp_graph_inference.sh b/tests/model-test/cpp_graph_inference.sh
index afbd46188..b8886fa39 100644
--- a/tests/model-test/cpp_graph_inference.sh
+++ b/tests/model-test/cpp_graph_inference.sh
@@ -146,7 +146,7 @@ model_name_map["starcoder-3b"]="bigcode/starcoder"
 model_name_map["bloom-7b"]="bigscience/bloom-7b1"
 model_name_map["opt-1.3b"]="facebook/opt-1.3b"
 model_name_map["dolly-v2-3b"]="databricks/dolly-v2-3b"
-model_name_map["chatglm3"]="THUDM/chatglm3-6b"
+model_name_map["chatglm3-6b"]="THUDM/chatglm3-6b"
 model_name_map["chatglm2"]="THUDM/chatglm2-6b"
 model_name_map["chatglm-6b"]="THUDM/chatglm-6b"
 model_name_map["baichuan2-13b"]="baichuan-inc/Baichuan2-13B-Chat"
@@ -363,6 +363,7 @@ function main() {
     ninja
     cd ..
     pip install -r $working_dir/requirements.txt
+    pip install lm_eval
     python $working_dir/setup.py install
     ## prepare example requirement
     if [[ -f $requirements_file ]]; then
@@ -468,8 +469,10 @@ function main() {
         chmod 777 ${WORKSPACE}/${logs_file}
         if [[ ${input} == "1024" && ${cores_per_instance} == "32" ]]; then
             echo "-------- Accuracy start--------"
-            if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" ]]; then
+            if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" || "${model}" == "mistral-7b" ]]; then
                 OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --batch_size 8 --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
+            elif [[ "${model}" == *"gptq" ]]; then
+                OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --use_gptq --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
             else
                 OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --tasks lambada_openai --batch_size 1 2>&1 | tee -a ${WORKSPACE}/${logs_file}
             fi
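The two `acc` patterns added to parse_output_file_acc() differ only in whether whitespace may precede the accuracy value inside the table cell. A small self-contained check of both patterns; the two lm-eval-style result rows below are hypothetical sample data, not real output:

```python
import re

# Hypothetical result rows: the first has the acc value flush against the
# column pipe, the second pads it with a space; the parser now tries both.
lines = [
    "|   |   |none |   0|acc |0.7123|±  |0.0031|",
    "|   |   |none |   0|acc | 0.7123|±  |0.0031|",
]

patterns = [
    r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|",
    r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|",
]

for line in lines:
    for pattern in patterns:
        match = re.search(pattern, line)
        if match:
            # Extract the accuracy value and convert it to a percentage,
            # as the script does; prints 71.23 for both sample rows.
            print(float(re.search(r"\d+\.\d+", match.group()).group()) * 100)
            break
```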