From bc31b47f85a0bb6143e709ec18def512a6da1f3b Mon Sep 17 00:00:00 2001
From: intellinjun
Date: Tue, 28 May 2024 17:30:08 +0800
Subject: [PATCH 1/6] enable tiny_llama

Signed-off-by: intellinjun
---
 docs/supported_models.md              | 3 ++-
 neural_speed/convert/convert_llama.py | 2 +-
 neural_speed/models/llama/llama.h     | 7 +++++++
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/docs/supported_models.md b/docs/supported_models.md
index 115bc9955..23e69657f 100644
--- a/docs/supported_models.md
+++ b/docs/supported_models.md
@@ -38,7 +38,8 @@ Neural Speed supports the following models:
         8192
-        LLaMA2-7B,
+        TinyLlama-1.1B,
+        LLaMA2-7B,
         LLaMA2-13B,
         LLaMA2-70B
         ✅
diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index 25f307216..e7a08e732 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -1356,7 +1356,7 @@ def load_some_model(path: Path) -> ModelPlus:
     # Be extra-friendly and accept either a file or a directory:
     if path.is_dir():
         # Check if it's a set of safetensors files first
-        files = list(path.glob("model-00001-of-*.safetensors"))
+        files = list(path.glob("model*.safetensors"))
         if not files:
             # Try the PyTorch patterns too, with lower priority
             globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
diff --git a/neural_speed/models/llama/llama.h b/neural_speed/models/llama/llama.h
index 99fb65a72..02d44df39 100644
--- a/neural_speed/models/llama/llama.h
+++ b/neural_speed/models/llama/llama.h
@@ -20,6 +20,7 @@
 enum llama_model {
   LLAMA_UNKNOWN,
+  Tiny_llama,
   LLAMA_7B,
   LLAMA_13B,
   LLAMA_30B,
@@ -28,6 +29,12 @@
 static const model_scratch llama_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
   switch (n_layers) {
+    case 22:
+      return {
+          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+          static_cast<unsigned long long>(scratch_size_ratio * 2048) * MB,
+          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+      };
     case 32:
       return {
           static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,

From 0ae3fd65cd4066230ea16858fa6a4970815fe7ff Mon Sep 17 00:00:00 2001
From: intellinjun <105184542+intellinjun@users.noreply.github.com>
Date: Tue, 28 May 2024 18:19:28 +0800
Subject: [PATCH 2/6] Update convert_llama.py

---
 neural_speed/convert/convert_llama.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index e7a08e732..9866618e4 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -1356,7 +1356,9 @@ def load_some_model(path: Path) -> ModelPlus:
     # Be extra-friendly and accept either a file or a directory:
     if path.is_dir():
         # Check if it's a set of safetensors files first
-        files = list(path.glob("model*.safetensors"))
+        files = list(path.glob("model-00001-of-*.safetensors"))
+        if not files
+            files = list(path.glob("model*.safetensors")) # for only one safetensor
         if not files:
             # Try the PyTorch patterns too, with lower priority
             globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]

From b84772800c4d80942491ccd82f8ce6b8c0142578 Mon Sep 17 00:00:00 2001
From: intellinjun <105184542+intellinjun@users.noreply.github.com>
Date: Tue, 28 May 2024 18:21:01 +0800
Subject: [PATCH 3/6] Update llama.h

---
 neural_speed/models/llama/llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_speed/models/llama/llama.h b/neural_speed/models/llama/llama.h
index 02d44df39..e3e5d5637 100644
--- a/neural_speed/models/llama/llama.h
+++ b/neural_speed/models/llama/llama.h
@@ -20,7 +20,7 @@
 enum llama_model {
   LLAMA_UNKNOWN,
-  Tiny_llama,
+  TINY_LLAMA,
   LLAMA_7B,
   LLAMA_13B,
   LLAMA_30B,
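Note on PATCH 1/6: llama_mem_req dispatches purely on the layer count, and TinyLlama-1.1B has 22 hidden layers, so the new "case 22:" is what routes TinyLlama to its scratch buffers. Below is a minimal Python sketch of that lookup using only the sizes visible in the patch; MB, SCRATCH_MB, and scratch_for are illustrative names, not repo code:

    # Illustrative mirror of the `case 22:` branch added in PATCH 1/6.
    MB = 1024 * 1024  # the C++ header multiplies each size by an MB constant

    # n_layers -> the three scratch-buffer sizes (in MB) from the patch
    SCRATCH_MB = {
        22: (4096, 2048, 4096),  # TinyLlama-1.1B
    }

    def scratch_for(n_layers: int, scratch_size_ratio: float = 1.0) -> tuple:
        # Scale each per-model size by the ratio, then convert MB -> bytes,
        # matching llama_mem_req's static_cast<...>(ratio * size) * MB.
        return tuple(int(scratch_size_ratio * s) * MB for s in SCRATCH_MB[n_layers])

    print(scratch_for(22))  # (4294967296, 2147483648, 4294967296)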
From 235cd03452754180dfe465a9894276a17d962dc7 Mon Sep 17 00:00:00 2001
From: intellinjun <105184542+intellinjun@users.noreply.github.com>
Date: Tue, 28 May 2024 19:37:51 +0800
Subject: [PATCH 4/6] Update convert_llama.py

---
 neural_speed/convert/convert_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index 9866618e4..e579b0b8b 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -1357,7 +1357,7 @@ def load_some_model(path: Path) -> ModelPlus:
     if path.is_dir():
         # Check if it's a set of safetensors files first
         files = list(path.glob("model-00001-of-*.safetensors"))
-        if not files
+        if not files:
             files = list(path.glob("model*.safetensors")) # for only one safetensor
         if not files:
             # Try the PyTorch patterns too, with lower priority

From 03a2ab55639fa5b6c59e714eab5cde548f464b5d Mon Sep 17 00:00:00 2001
From: intellinjun
Date: Wed, 29 May 2024 11:28:35 +0800
Subject: [PATCH 5/6] update ci

Signed-off-by: intellinjun
---
 neural_speed/__init__.py                  | 1 +
 tests/model-test/calculate_percentiles.py | 4 ++++
 tests/model-test/cpp_graph_inference.sh   | 4 +++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py
index 2b98a4fbb..7e7dc3750 100644
--- a/neural_speed/__init__.py
+++ b/neural_speed/__init__.py
@@ -223,6 +223,7 @@ def init(self,

     def init_from_bin(self, model_type, model_path, **generate_kwargs):
         if self.module is None:
+            model_type = model_maps.get(model_type, model_type)
             self.module = _import_package(model_type)
         self.model = self.module.Model()
         if model_type=="whisper":
diff --git a/tests/model-test/calculate_percentiles.py b/tests/model-test/calculate_percentiles.py
index f2f32e2f6..752a4715c 100644
--- a/tests/model-test/calculate_percentiles.py
+++ b/tests/model-test/calculate_percentiles.py
@@ -37,6 +37,10 @@ def parse_output_file_acc(file_path):
     with open(file_path, 'r', encoding='UTF-8', errors='ignore') as file:
         for line in file:
             accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|", line)
+            if accuracy_match:
+                accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
+                continue
+            accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|", line)
             if accuracy_match:
                 accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
                 continue
diff --git a/tests/model-test/cpp_graph_inference.sh b/tests/model-test/cpp_graph_inference.sh
index afbd46188..63b7e3a8b 100644
--- a/tests/model-test/cpp_graph_inference.sh
+++ b/tests/model-test/cpp_graph_inference.sh
@@ -468,8 +468,10 @@ function main() {
             chmod 777 ${WORKSPACE}/${logs_file}
             if [[ ${input} == "1024" && ${cores_per_instance} == "32" ]]; then
                 echo "-------- Accuracy start--------"
-                if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" ]]; then
+                if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" || "${model}" == "mistral-7b" ]]; then
                     OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --batch_size 8 --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
+                elif [[ "${model}" == *"gptq" ]]; then
+                    OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --use_gptq --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
                 else
                     OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --tasks lambada_openai --batch_size 1 2>&1 | tee -a ${WORKSPACE}/${logs_file}
                 fi
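Note on PATCH 2/6 and PATCH 4/6 (the latter fixes the missing colon the former introduced): together they settle load_some_model on a two-step safetensors probe, trying the sharded "model-00001-of-*.safetensors" name first and only then the broad "model*.safetensors" glob, so single-file checkpoints such as TinyLlama's lone model.safetensors are found before any PyTorch pattern is tried. A standalone sketch of the resolved lookup order, assuming a plain loop in place of the script's actual handling of globs:

    from pathlib import Path

    def find_checkpoint_files(path: Path) -> list:
        # Sharded safetensors first; matching only shard 00001 keeps the
        # broader pattern below from returning every shard at once.
        files = list(path.glob("model-00001-of-*.safetensors"))
        if not files:
            # Single-file checkpoints ship as just "model.safetensors".
            files = list(path.glob("model*.safetensors"))
        if not files:
            # PyTorch patterns, with lower priority (same order as the script).
            for pattern in ("consolidated.00.pth", "pytorch_model-00001-of-*.bin",
                            "*.pt", "pytorch_model.bin"):
                files = list(path.glob(pattern))
                if files:
                    break
        return files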
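Note on PATCH 5/6's change to init_from_bin: model_maps.get(model_type, model_type) canonicalizes the model-type alias before _import_package resolves it, and is a no-op for names that are already canonical. The mapping below is invented for illustration; the real model_maps dict is defined elsewhere in neural_speed/__init__.py:

    # Invented aliases for illustration; see model_maps in neural_speed/__init__.py.
    model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}

    for model_type in ("gpt_neox", "llama"):
        # Aliases are rewritten; unknown keys pass through unchanged.
        print(model_maps.get(model_type, model_type))  # gptneox, then llama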
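Note on the near-duplicate regex PATCH 5/6 adds to parse_output_file_acc: it differs from the first pattern only by a \s+ before the accuracy value, which appears intended to accept lm-eval table output that left-pads the acc column. A quick check against two invented sample lines (the assumption being that both spacings occur in real logs):

    import re

    PATTERNS = (
        r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|",     # unpadded value
        r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|",  # padded value
    )

    SAMPLES = (
        "|  |  |none  |  0|acc  |0.7500|±  |0.0063|",
        "|  |  |none  |  0|acc  |  0.7500|±  |0.0063|",
    )

    for line in SAMPLES:
        for pattern in PATTERNS:
            match = re.search(pattern, line)
            if match:
                # Same extraction as the script: first float in the match, as a percent.
                print(float(re.search(r"\d+\.\d+", match.group()).group()) * 100)
                break  # prints 75.0 for each sample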
From 39e004525437982f0a158e276673da6b1428ac2c Mon Sep 17 00:00:00 2001
From: intellinjun
Date: Wed, 29 May 2024 17:04:44 +0800
Subject: [PATCH 6/6] update ci

Signed-off-by: intellinjun
---
 neural_speed/convert/convert_baichuan.py | 10 ++++++++--
 tests/model-test/cpp_graph_inference.sh  |  3 ++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py
index fea0641a9..62595d996 100644
--- a/neural_speed/convert/convert_baichuan.py
+++ b/neural_speed/convert/convert_baichuan.py
@@ -144,7 +144,10 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["num_hidden_layers"]))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", ftype))
-    fout.write(struct.pack("i", hparams["model_max_length"]))
+    if "max_position_embeddings" in hparams:
+        fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+    else:
+        fout.write(struct.pack("i", hparams["model_max_length"]))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("i", 0))
@@ -248,7 +251,10 @@ def baichuan7B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["num_hidden_layers"]))
     fout.write(struct.pack("i", 128))
     fout.write(struct.pack("i", ftype))
-    fout.write(struct.pack("i", hparams["model_max_length"]))
+    if "max_position_embeddings" in hparams:
+        fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+    else:
+        fout.write(struct.pack("i", hparams["model_max_length"]))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("i", 0))
diff --git a/tests/model-test/cpp_graph_inference.sh b/tests/model-test/cpp_graph_inference.sh
index 63b7e3a8b..b8886fa39 100644
--- a/tests/model-test/cpp_graph_inference.sh
+++ b/tests/model-test/cpp_graph_inference.sh
@@ -146,7 +146,7 @@ model_name_map["starcoder-3b"]="bigcode/starcoder"
 model_name_map["bloom-7b"]="bigscience/bloom-7b1"
 model_name_map["opt-1.3b"]="facebook/opt-1.3b"
 model_name_map["dolly-v2-3b"]="databricks/dolly-v2-3b"
-model_name_map["chatglm3"]="THUDM/chatglm3-6b"
+model_name_map["chatglm3-6b"]="THUDM/chatglm3-6b"
 model_name_map["chatglm2"]="THUDM/chatglm2-6b"
 model_name_map["chatglm-6b"]="THUDM/chatglm-6b"
 model_name_map["baichuan2-13b"]="baichuan-inc/Baichuan2-13B-Chat"
@@ -363,6 +363,7 @@ function main() {
         ninja
         cd ..
         pip install -r $working_dir/requirements.txt
+        pip install lm_eval
         python $working_dir/setup.py install
         ## prepare example requirement
         if [[ -f $requirements_file ]]; then
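Note on PATCH 6/6's converter change: max_position_embeddings is the standard Hugging Face config key for the context window, while model_max_length is a tokenizer-side field that some Baichuan checkpoints carry instead, so the converter now prefers the former and falls back to the latter. A self-contained sketch of the packing pattern (the sample hparams are invented):

    import io
    import struct

    def pack_context_length(fout, hparams: dict) -> None:
        # Write the context window as an int32, preferring the standard key.
        if "max_position_embeddings" in hparams:
            fout.write(struct.pack("i", hparams["max_position_embeddings"]))
        else:
            fout.write(struct.pack("i", hparams["model_max_length"]))

    # A config that only carries model_max_length still converts.
    buf = io.BytesIO()
    pack_context_length(buf, {"model_max_length": 4096})
    print(struct.unpack("i", buf.getvalue())[0])  # 4096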