diff --git a/docs/supported_models.md b/docs/supported_models.md
index 115bc9955..23e69657f 100644
--- a/docs/supported_models.md
+++ b/docs/supported_models.md
@@ -38,7 +38,8 @@ Neural Speed supports the following models:
8192 |
- LLaMA2-7B,
+ | TinyLlama-1.1B,
+ LLaMA2-7B,
LLaMA2-13B,
LLaMA2-70B |
✅ |
diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py
index 2b98a4fbb..7e7dc3750 100644
--- a/neural_speed/__init__.py
+++ b/neural_speed/__init__.py
@@ -223,6 +223,7 @@ def init(self,
def init_from_bin(self, model_type, model_path, **generate_kwargs):
if self.module is None:
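+ # map model-type aliases to their backend package name before import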
+ model_type = model_maps.get(model_type, model_type)
self.module = _import_package(model_type)
self.model = self.module.Model()
if model_type=="whisper":
diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py
index fea0641a9..62595d996 100644
--- a/neural_speed/convert/convert_baichuan.py
+++ b/neural_speed/convert/convert_baichuan.py
@@ -144,7 +144,10 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", 0))
fout.write(struct.pack("i", ftype))
- fout.write(struct.pack("i", hparams["model_max_length"]))
+ if "max_position_embeddings" in hparams:
+ fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+ else:
+ fout.write(struct.pack("i", hparams["model_max_length"]))
fout.write(struct.pack("f", 0))
fout.write(struct.pack("f", 0))
fout.write(struct.pack("i", 0))
@@ -248,7 +251,10 @@ def baichuan7B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", 128))
fout.write(struct.pack("i", ftype))
- fout.write(struct.pack("i", hparams["model_max_length"]))
+ if "max_position_embeddings" in hparams:
+ fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+ else:
+ fout.write(struct.pack("i", hparams["model_max_length"]))
fout.write(struct.pack("f", 0))
fout.write(struct.pack("f", 0))
fout.write(struct.pack("i", 0))
diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index 25f307216..e579b0b8b 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -1357,6 +1357,8 @@ def load_some_model(path: Path) -> ModelPlus:
if path.is_dir():
# Check if it's a set of safetensors files first
files = list(path.glob("model-00001-of-*.safetensors"))
+ if not files:
+ files = list(path.glob("model*.safetensors")) # for only one safetensor
if not files:
# Try the PyTorch patterns too, with lower priority
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
diff --git a/neural_speed/models/llama/llama.h b/neural_speed/models/llama/llama.h
index 99fb65a72..e3e5d5637 100644
--- a/neural_speed/models/llama/llama.h
+++ b/neural_speed/models/llama/llama.h
@@ -20,6 +20,7 @@
enum llama_model {
LLAMA_UNKNOWN,
+ TINY_LLAMA,
LLAMA_7B,
LLAMA_13B,
LLAMA_30B,
@@ -28,6 +29,12 @@ enum llama_model {
static const model_scratch llama_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
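+ // 22 decoder layers: TinyLlama-1.1B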
+ case 22:
+ return {
+ static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+ static_cast<unsigned long long>(scratch_size_ratio * 2048) * MB,
+ static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+ };
case 32:
return {
static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
diff --git a/tests/model-test/calculate_percentiles.py b/tests/model-test/calculate_percentiles.py
index f2f32e2f6..752a4715c 100644
--- a/tests/model-test/calculate_percentiles.py
+++ b/tests/model-test/calculate_percentiles.py
@@ -37,6 +37,10 @@ def parse_output_file_acc(file_path):
with open(file_path, 'r', encoding='UTF-8', errors='ignore') as file:
for line in file:
accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|", line)
+ if accuracy_match:
+ accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
+ continue
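+ # also accept rows where the acc value is left-padded with spaces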
+ accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|", line)
if accuracy_match:
accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
continue
diff --git a/tests/model-test/cpp_graph_inference.sh b/tests/model-test/cpp_graph_inference.sh
index afbd46188..b8886fa39 100644
--- a/tests/model-test/cpp_graph_inference.sh
+++ b/tests/model-test/cpp_graph_inference.sh
@@ -146,7 +146,7 @@ model_name_map["starcoder-3b"]="bigcode/starcoder"
model_name_map["bloom-7b"]="bigscience/bloom-7b1"
model_name_map["opt-1.3b"]="facebook/opt-1.3b"
model_name_map["dolly-v2-3b"]="databricks/dolly-v2-3b"
-model_name_map["chatglm3"]="THUDM/chatglm3-6b"
+model_name_map["chatglm3-6b"]="THUDM/chatglm3-6b"
model_name_map["chatglm2"]="THUDM/chatglm2-6b"
model_name_map["chatglm-6b"]="THUDM/chatglm-6b"
model_name_map["baichuan2-13b"]="baichuan-inc/Baichuan2-13B-Chat"
@@ -363,6 +363,7 @@ function main() {
ninja
cd ..
pip install -r $working_dir/requirements.txt
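+ # lm_eval is needed for the accuracy evaluation step below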
+ pip install lm_eval
python $working_dir/setup.py install
## prepare example requirement
if [[ -f $requirements_file ]]; then
@@ -468,8 +469,10 @@ function main() {
chmod 777 ${WORKSPACE}/${logs_file}
if [[ ${input} == "1024" && ${cores_per_instance} == "32" ]]; then
echo "-------- Accuracy start--------"
- if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" ]]; then
+ if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" || "${model}" == "mistral-7b" ]]; then
OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --batch_size 8 --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
+ elif [[ "${model}" == *"gptq" ]]; then
+ OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --use_gptq --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
else
OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --tasks lambada_openai --batch_size 1 2>&1 | tee -a ${WORKSPACE}/${logs_file}
fi