feat: optional --weights-float-type argument. (#84)
b4rtaz authored Jun 1, 2024
1 parent ecdadee commit 08b4bcf
Showing 9 changed files with 40 additions and 162 deletions.
27 changes: 12 additions & 15 deletions README.md
@@ -4,28 +4,30 @@

[![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/b4rtaz/distributed-llama/.github%2Fworkflows%2Fmain.yml?style=flat-square)](https://github.com/b4rtaz/distributed-llama/actions) [![License: MIT](https://img.shields.io/github/license/mashape/apistatus.svg?style=flat-square)](/LICENSE) [![Discord](https://img.shields.io/discord/1245814812353495070?style=flat-square&color=%23788AD1)](https://discord.gg/7M4BXkM4)


Tensor parallelism is all you need. Run LLMs on weak devices or make powerful devices even more powerful by distributing the workload and dividing the RAM usage. This project proves that it's possible to split the workload of LLMs across multiple devices and achieve a significant speedup. Distributed Llama allows you to run huge LLMs in-house. The project uses TCP sockets to synchronize the state. You can easily configure your AI cluster by using a home router.

<p align="center">
<img src=".github/8raspi.jpg" width="50%" alt="Distributed Llama running on 8 Raspberry Pi 4B devices" /><br />
<sub><sup>Distributed Llama running Llama 2 70B on 8 Raspberry Pi 4B devices</sup></sub>
</p>

<!--
**🔥 Run Distributed Llama by single command**
**🔥 Download Model & Build by Single Command**

Python 3 and GCC required.

**Chat & API**

Python and GCC required. Download this repository and run:
* Llama 3 8B Instruct: `python launch.py llama3_instruct`

* Llama 3 8B: `python download-model.py llama3`
* Llama 3 8B Instruct: `python download-model.py llama3_instruct`
* TinyLlama: `python download-model.py tinylama`
-->
**Convert Model Manually**

* [Llama 2](./docs/LLAMA.md#how-to-run-llama-2)
* [Llama 3](./docs/LLAMA.md#how-to-run-llama-3)

**Supported modes:**

- Inference
- Chat
- Inference CLI
- Chat CLI
- [API Server](./src/apps/dllama-api/README.md)

**Known limitations:**
@@ -112,11 +114,6 @@ All tests below were conducted on c3d-highcpu-30 (30 vCPU, 15 core, 59 GB memory

<sub><sup>S - sent data from the root node to workers, R - received data by the root node from workers, tested on 0.7.1 version</sup></sub>

## Download Model and Run

* [How to Run Llama 2](./docs/LLAMA.md#how-to-run-llama-2)
* [How to Run Llama 3](./docs/LLAMA.md#how-to-run-llama-3)

## 📟 How to Run on Raspberry Pi Devices

1. Install `Raspberry Pi OS Lite (64 bit)` on your Raspberry Pi devices. This OS doesn't have a desktop environment.
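As a rough illustration of the cluster setup described in the README introduction above (state synchronized over TCP sockets between a root node and worker nodes), the commands below sketch how such a cluster might be started. The `worker` mode and the `--workers` argument are assumptions taken from the project's broader documentation and are not part of this diff; only the `inference` invocation and the default port 9990 appear in the files changed here.

```sh
# On each worker device (a sketch; 9990 is the default port set in src/app.cpp)
./dllama worker --port 9990 --nthreads 4

# On the root device, listing the workers' addresses (hypothetical IPs)
./dllama inference \
  --model dllama_model_tinylama_1.1b_3t_q40.m \
  --tokenizer dllama_tokenizer_tinylama_1.1b_3t_q40.t \
  --buffer-float-type q80 --nthreads 4 --steps 64 --prompt "Hello world" \
  --workers 10.0.0.2:9990 10.0.0.3:9990
```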
122 changes: 0 additions & 122 deletions converter/convert-grok-1.py

This file was deleted.

5 changes: 3 additions & 2 deletions converter/convert-llama.py
@@ -22,6 +22,7 @@ def convert(modelPath, outputPath, targetFloatType):
params['arch_type'] = 0xABCD00
params['n_experts'] = 0
params['n_active_experts'] = 0
params['weights_float_type'] = targetFloatType
if ('rope_theta' in params):
params['rope_theta'] = int(params['rope_theta'])

@@ -107,8 +108,8 @@ def usage():
modelPath = sys.argv[1]
targetFloatType = parseFloatType(sys.argv[2])

modelName = modelPath.split('/')[-1]
outputFileName = f'dllama_model_{modelName.lower()}_{targetFloatType}.m'
modelName = os.path.basename(modelPath)
outputFileName = f'dllama_model_{modelName.lower()}_{sys.argv[2]}.m'

print(f'Model name: {modelName}')
print(f'Target float type: {targetFloatType}')
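With the change above, the converter records the target float type in the model header (`params['weights_float_type']`) and names the output file after the float-type string passed on the command line rather than the parsed enum value. A hedged usage sketch, assuming a local model directory named `llama-2-7b` (the full set of arguments the script accepts is not shown in this diff):

```sh
python converter/convert-llama.py /path/to/llama-2-7b q40
# expected output file: dllama_model_llama-2-7b_q40.m, with weights_float_type stored in its header
```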
1 change: 0 additions & 1 deletion converter/convert-tokenizer-llama3.py
@@ -1,5 +1,4 @@
import sys
import struct
import base64
writer = __import__('tokenizer-writer')

29 changes: 12 additions & 17 deletions download-model.py → launch.py
@@ -2,27 +2,17 @@
import sys
import requests

# ['model-url', 'tokenizer-url', 'weights-float-type', 'buffer-float-type', 'model-type']
MODELS = {
'llama3_8b_q40': [
'https://huggingface.co/b4rtaz/llama-3-8b-distributed-llama/resolve/main/dllama_meta-llama-3-8b_q40.bin?download=true',
'https://huggingface.co/b4rtaz/llama-3-8b-distributed-llama/resolve/main/dllama_meta-llama3-tokenizer.t?download=true',
],
'llama3_8b_instruct_q40': [
'https://huggingface.co/Azamorn/Meta-Llama-3-8B-Instruct-Distributed/resolve/main/dllama_original_q40.bin?download=true',
'https://huggingface.co/Azamorn/Meta-Llama-3-8B-Instruct-Distributed/resolve/main/dllama-llama3-tokenizer.t?download=true',
],
'tinylama_1.1b_3t_q40': [
'https://huggingface.co/b4rtaz/tinyllama-1.1b-1431k-3t-distributed-llama/resolve/main/dllama_model_tinylama_1.1b_3t_q40.m?download=true',
'https://huggingface.co/b4rtaz/tinyllama-1.1b-1431k-3t-distributed-llama/resolve/main/dllama_tokenizer_tinylama_1.1b_3t_q40.t?download=true'
'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_lama3_instruct_q40.m?download=true',
'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3.t?download=true',
'q40', 'q80', 'chat'
]
}

ALIASES = {
'llama3': 'llama3_8b_q40',
'llama3_8b': 'llama3_8b_q40',
'llama3_instruct': 'llama3_8b_instruct_q40',
'llama3_8b_instruct': 'llama3_8b_instruct_q40',
'tinylama': 'tinylama_1.1b_3t_q40'
'llama3_instruct': 'llama3_8b_instruct_q40'
}

def downloadFile(url: str, path: str):
@@ -80,8 +70,13 @@ def printUsage():
print(f'Model is not supported: {modelName}')
exit(1)

(modelPath, tokenizerPath) = download(modelName, MODELS[modelName])
command = f'./dllama inference --model {modelPath} --tokenizer {tokenizerPath} --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --steps 64 --prompt "Hello world"'
model = MODELS[modelName]
(modelPath, tokenizerPath) = download(modelName, model)
if (model[4] == 'chat'):
command = './dllama chat'
else:
command = './dllama inference --steps 64 --prompt "Hello world"'
command += f' --model {modelPath} --tokenizer {tokenizerPath} --buffer-float-type {model[3]} --nthreads 4'

print('To run Distributed Llama you need to execute:')
print('--- copy start ---')
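Each entry in the reworked `MODELS` table now carries `weights-float-type`, `buffer-float-type`, and `model-type` metadata, and the script assembles the launch command from it. For the `llama3_instruct` alias, the printed command would look roughly like this (file names follow the download URLs above; paths are illustrative):

```sh
# model-type 'chat' selects the chat subcommand; q80 is the model's buffer-float-type entry.
# Note that --weights-float-type is no longer passed: the model header now carries it.
./dllama chat \
  --model dllama_model_lama3_instruct_q40.m \
  --tokenizer dllama_tokenizer_llama3.t \
  --buffer-float-type q80 --nthreads 4
```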
2 changes: 1 addition & 1 deletion src/app.cpp
@@ -23,7 +23,7 @@ AppArgs AppArgs::parse(int argc, char** argv, bool hasMode) {
args.modelPath = NULL;
args.tokenizerPath = NULL;
args.prompt = NULL;
args.weightsFloatType = F32;
args.weightsFloatType = FUNK;
args.bufferFloatType = F32;
args.nWorkers = 0;
args.port = 9990;
4 changes: 4 additions & 0 deletions src/quants.cpp
@@ -18,6 +18,8 @@ int getNumbersPerBatch(FloatType type) {
return QK40;
case Q80:
return QK80;
case FUNK:
break;
}
fprintf(stderr, "Unsupported float type %d\n", type);
exit(EXIT_FAILURE);
@@ -41,6 +43,8 @@ long getBatchBytes(FloatType type, int n, int d) {
int blocks = n / QK80 * d;
return blocks * sizeof(BlockQ80);
}
case FUNK:
break;
}
fprintf(stderr, "Unsupported float type %d\n", type);
exit(EXIT_FAILURE);
1 change: 1 addition & 0 deletions src/quants.hpp
@@ -4,6 +4,7 @@
#include <cstdint>

enum FloatType {
FUNK = -1,
F32 = 0,
F16 = 1,
Q40 = 2,
11 changes: 7 additions & 4 deletions src/transformer.cpp
@@ -220,7 +220,8 @@ TransformerSpec Transformer::loadSpecFromFile(const char* path, const unsigned i
throw std::runtime_error("Cannot read header values");
}
int nKv = (spec.headerSize - 2 * sizeof(int)) / sizeof(int);
int modelWeightsFloatType = -1;

FloatType modelWeightsFloatType = FUNK;
for (int i = 0; i < nKv; i += 2) {
int key = buffer[i];
int value = buffer[i + 1];
@@ -237,18 +238,20 @@
else if (key == SEQ_LEN) spec.seqLen = value;
else if (key == HIDDEN_ACT) spec.hiddenAct = (TransformerHiddenAct)value;
else if (key == ROPE_THETA) spec.ropeTheta = (float)value;
else if (key == WEIGHTS_FLOAT_TYPE) modelWeightsFloatType = value;
else if (key == WEIGHTS_FLOAT_TYPE) weightsFloatType = (FloatType)value;
else {
throw std::runtime_error("Unsupported header key");
}
}

if (modelWeightsFloatType != -1 && modelWeightsFloatType != weightsFloatType)
throw std::runtime_error("Model has different weights float type than passed in argument");

} else {
throw std::runtime_error("Unsupported model file");
}

if (weightsFloatType == FUNK)
throw std::runtime_error("Not specified weights float type");

spec.headSize = spec.dim / spec.nHeads;
spec.kvDim = (spec.dim * spec.nKvHeads) / spec.nHeads;
spec.weightsFloatType = weightsFloatType;
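Taken together, these changes make `--weights-float-type` optional: the argument now defaults to the `FUNK` sentinel, the loader uses the `weights_float_type` value recorded in the model header when present, and it only fails if neither the header nor the argument specifies a type. A hedged before/after sketch of the CLI (model and tokenizer paths are illustrative):

```sh
# Previously the weights float type defaulted to f32, so quantized models needed the flag:
./dllama inference --model dllama_model_lama3_instruct_q40.m --tokenizer dllama_tokenizer_llama3.t \
  --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --steps 64 --prompt "Hello world"

# With a model converted by the updated converter, the flag can be omitted:
./dllama inference --model dllama_model_lama3_instruct_q40.m --tokenizer dllama_tokenizer_llama3.t \
  --buffer-float-type q80 --nthreads 4 --steps 64 --prompt "Hello world"
```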
