This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Enable tiny_llama #270

Merged 6 commits on May 30, 2024
Changes from 4 commits
3 changes: 2 additions & 1 deletion docs/supported_models.md
@@ -38,7 +38,8 @@ Neural Speed supports the following models:
     <td>8192</td>
   </tr>
   <tr>
-    <td><a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-7B</a>,
+    <td><a href="https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0" target="_blank" rel="noopener noreferrer">TinyLlama-1.1B</a>,
+    <a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-7B</a>,
     <a href="https://huggingface.co/meta-llama/Llama-2-13b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-13B</a>,
     <a href="https://huggingface.co/meta-llama/Llama-2-70b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-70B</a></td>
     <td>✅</td>
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_llama.py
@@ -1357,6 +1357,8 @@ def load_some_model(path: Path) -> ModelPlus:
     if path.is_dir():
         # Check if it's a set of safetensors files first
         files = list(path.glob("model-00001-of-*.safetensors"))
+        if not files:
+            files = list(path.glob("model*.safetensors"))  # for a single safetensors file
         if not files:
             # Try the PyTorch patterns too, with lower priority
             globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
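This fallback matters because some checkpoints, TinyLlama-1.1B among them, ship a single model.safetensors file rather than sharded model-00001-of-*.safetensors pieces, so the sharded pattern alone finds nothing. A minimal sketch of the resulting lookup order; the helper name and the loop over the PyTorch patterns are illustrative, not taken from the PR:

    from pathlib import Path

    def find_checkpoint_files(path: Path) -> list:
        # Sharded safetensors checkpoints: model-00001-of-00002.safetensors, ...
        files = list(path.glob("model-00001-of-*.safetensors"))
        if not files:
            # Single-file checkpoints ship just model.safetensors, which the
            # sharded pattern above never matches.
            files = list(path.glob("model*.safetensors"))
        if not files:
            # Lower priority: classic PyTorch checkpoint patterns.
            for pattern in ("consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"):
                files = list(path.glob(pattern))
                if files:
                    break
        return files

    print(find_checkpoint_files(Path("TinyLlama-1.1B-Chat-v1.0")))

Note that model*.safetensors would also match sharded files, but it is only consulted after the sharded pattern has already failed, so the priority order is preserved.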
7 changes: 7 additions & 0 deletions neural_speed/models/llama/llama.h
@@ -20,6 +20,7 @@

 enum llama_model {
   LLAMA_UNKNOWN,
+  TINY_LLAMA,
   LLAMA_7B,
   LLAMA_13B,
   LLAMA_30B,
@@ -28,6 +29,12 @@ enum llama_model {

 static const model_scratch llama_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
   switch (n_layers) {
+    case 22:
+      return {
+          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+          static_cast<unsigned long long>(scratch_size_ratio * 2048) * MB,
+          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+      };
     case 32:
       return {
           static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
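The new case 22 branch exists because TinyLlama-1.1B has 22 transformer layers, whereas the LLaMA-7B family has 32: llama_mem_req keys the scratch-buffer sizes off the layer count, and without this branch a 22-layer model would not match any case in the switch. A quick Python check of the three sizes the new branch reserves; the helper below is illustrative, not part of the PR:

    MB = 1024 * 1024

    def tiny_llama_scratch_bytes(scratch_size_ratio: float = 1.0):
        # Mirrors the case 22 branch of llama_mem_req: the scaled value is
        # truncated to an integer (as static_cast does) before multiplying by MB.
        return (
            int(scratch_size_ratio * 4096) * MB,
            int(scratch_size_ratio * 2048) * MB,
            int(scratch_size_ratio * 4096) * MB,
        )

    print([n // MB for n in tiny_llama_scratch_bytes()])     # [4096, 2048, 4096]
    print([n // MB for n in tiny_llama_scratch_bytes(0.5)])  # [2048, 1024, 2048]

With the default ratio of 1.0 the branch reserves 4 GiB, 2 GiB, and 4 GiB for the three model_scratch buffers, the same sizes the visible case 32 hunk begins with for the 32-layer models.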