feat: add llama3_1_405b_instruct_q40 to launch.py. (#112)

b4rtaz · Jul 31, 2024 · 5244daa · 5244daa
1 parent ee2c689
commit 5244daa
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -13,12 +13,13 @@ Tensor parallelism is all you need. Run LLMs on weak devices or make powerful de
 
 Python 3 and C++ compiler required. The command will download the model and the tokenizer.
 
-| Model                     | Purpose   | Size     | Command                                     |
-| ------------------------- | --------- | -------- | ------------------------------------------- |
-| TinyLlama 1.1B 3T Q40     | Benchmark | 844 MB   | `python launch.py tinyllama_1_1b_3t_q40`    |
-| Llama 3 8B Q40            | Benchmark | 6.32 GB  | `python launch.py llama3_8b_q40`            |
-| Llama 3 8B Instruct Q40   | Chat, API | 6.32 GB  | `python launch.py llama3_8b_instruct_q40`   |
-| Llama 3.1 8B Instruct Q40 | Chat, API | 6.32 GB  | `python launch.py llama3_1_8b_instruct_q40` |
+| Model                       | Purpose   | Size     | Command                                       |
+| --------------------------- | --------- | -------- | --------------------------------------------- |
+| TinyLlama 1.1B 3T Q40       | Benchmark | 844 MB   | `python launch.py tinyllama_1_1b_3t_q40`      |
+| Llama 3 8B Q40              | Benchmark | 6.32 GB  | `python launch.py llama3_8b_q40`              |
+| Llama 3 8B Instruct Q40     | Chat, API | 6.32 GB  | `python launch.py llama3_8b_instruct_q40`     |
+| Llama 3.1 8B Instruct Q40   | Chat, API | 6.32 GB  | `python launch.py llama3_1_8b_instruct_q40`   |
+| Llama 3.1 405B Instruct Q40 | Chat, API | 238 GB   | `python launch.py llama3_1_405b_instruct_q40` |
 
 ### 🛠️ Convert Model Manually
 

diff --git a/launch.py b/launch.py
@@ -2,59 +2,75 @@
 import sys
 import requests
 
-# ['model-url', 'tokenizer-url', 'weights-float-type', 'buffer-float-type', 'model-type']
+def parts(length):  # Build `length` two-letter lowercase suffixes in order: 'aa', 'ab', ..., 'az', 'ba', ...
+    result = []
+    for i in range(length):
+        a = chr(97 + (i // 26))  # first letter advances once every 26 suffixes (97 is ord('a'))
+        b = chr(97 + (i % 26))  # second letter cycles through 'a'..'z'
+        result.append(a + b)
+    return result  # list of str; both letters stay within 'a'..'z' only while length <= 676
+
+# [['model-url-0', 'model-url-1', ...], 'tokenizer-url', 'weights-float-type', 'buffer-float-type', 'model-type']
 MODELS = {
     'tinyllama_1_1b_3t_q40': [
-        'https://huggingface.co/b4rtaz/TinyLlama-1.1B-3T-Distributed-Llama/resolve/main/dllama_model_tinylama_1.1b_3t_q40.m?download=true',
+        ['https://huggingface.co/b4rtaz/TinyLlama-1.1B-3T-Distributed-Llama/resolve/main/dllama_model_tinylama_1.1b_3t_q40.m?download=true'],
         'https://huggingface.co/b4rtaz/TinyLlama-1.1B-3T-Distributed-Llama/resolve/main/dllama_tokenizer_tinylama_1.1b_3t.t?download=true',
         'q40', 'q80', 'base'
     ],
     'llama3_8b_q40': [
-        'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Distributed-Llama/resolve/main/dllama_model_meta-llama-3-8b_q40.m?download=true',
+        ['https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Distributed-Llama/resolve/main/dllama_model_meta-llama-3-8b_q40.m?download=true'],
         'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_llama3.t?download=true',
         'q40', 'q80', 'base'
     ],
     'llama3_8b_instruct_q40': [
-        'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_lama3_instruct_q40.m?download=true',
+        ['https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_lama3_instruct_q40.m?download=true'],
         'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3.t?download=true',
         'q40', 'q80', 'chat'
     ],
     'llama3_1_8b_instruct_q40': [
-        'https://huggingface.co/b4rtaz/Llama-3_1-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.1_instruct_q40.m?download=true',
+        ['https://huggingface.co/b4rtaz/Llama-3_1-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.1_instruct_q40.m?download=true'],
         'https://huggingface.co/b4rtaz/Llama-3_1-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama_3_1.t?download=true',
         'q40', 'q80', 'chat'
     ],
+    'llama3_1_405b_instruct_q40': [
+        list(map(lambda suffix : f'https://huggingface.co/b4rtaz/Llama-3_1-405B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama31_405b_q40_{suffix}?download=true', parts(56))),  # 56 part URLs, suffixes 'aa' through 'cd', in download order
+        'https://huggingface.co/b4rtaz/Llama-3_1-405B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama_3_1.t?download=true',
+        'q40', 'q80', 'chat'
+    ],
 }
 
-def downloadFile(url: str, path: str):
+def downloadFile(urls: str, path: str):  # Download every URL in order and write the bodies back-to-back into one file at `path`. NOTE(review): `urls` is iterated as a sequence of URL strings (callers pass lists); the `str` annotation looks stale.
     if (os.path.isfile(path)):
         fileName = os.path.basename(path)
         result = input(f'❓ {fileName} already exists, do you want to download again? ("Y" if yes): ')
         if (result.upper() != 'Y'):
             return
-    response = requests.get(url, stream=True)
-    response.raise_for_status()
-    print(f'📄 {url}')
-    lastSize = 0
+
+    lastSizeMb = 0  # last whole-MB count that was printed, so progress is reported once per MB
     with open(path, 'wb') as file:
-        for chunk in response.iter_content(chunk_size=4096):
-            file.write(chunk)
-            size = file.tell() // 1024
-            if (size - lastSize >= 8192):
-                sys.stdout.write("\rDownloaded %i kB" % size)
-                lastSize = size
+        for url in urls:  # each part is appended to the same open file, in list order
+            print(f'📄 {url}')
+            response = requests.get(url, stream=True)
+            response.raise_for_status()  # NOTE(review): the output file is already opened with 'wb' here, so an HTTP error leaves a partial file behind
+            for chunk in response.iter_content(chunk_size=4096):
+                file.write(chunk)
+                sizeMb = file.tell() // (1024 * 1024)  # file position is cumulative across all parts written so far
+                if (sizeMb != lastSizeMb):
+                    sys.stdout.write("\rDownloaded %i MB" % sizeMb)
+                    lastSizeMb = sizeMb
+            sys.stdout.write('\n')  # end the '\r'-rewritten progress line before the next URL is printed
     sys.stdout.write(' ✅\n')
 
 def download(modelName: str, model: list):
     dirPath = os.path.join('models', modelName)
     print(f'📀 Downloading {modelName} to {dirPath}...')
     os.makedirs(dirPath, exist_ok=True)
-    modelUrl = model[0]
+    modelUrls = model[0]  # first element of a MODELS entry: list of model part URLs
     tokenizerUrl = model[1]
     modelPath = os.path.join(dirPath, f'dllama_model_{modelName}.m')
     tokenizerPath = os.path.join(dirPath, f'dllama_tokenizer_{modelName}.t')
-    downloadFile(modelUrl, modelPath)
-    downloadFile(tokenizerUrl, tokenizerPath)
+    downloadFile(modelUrls, modelPath)  # all parts are written into the single file at modelPath
+    downloadFile([tokenizerUrl], tokenizerPath)  # wrap the single URL in a list, since downloadFile iterates its first argument
     print('📀 All files are downloaded')
     return (modelPath, tokenizerPath)