From 595a5658fc2aec9737f41a6de2c1cefa5120f894 Mon Sep 17 00:00:00 2001
From: Bart Tadych
Date: Sat, 1 Jun 2024 22:03:42 +0200
Subject: [PATCH] feat: update readme, add model. (#85)

---
 README.md                  | 160 ++++++++++++++++---------------------
 converter/convert-llama.py |   7 +-
 converter/writer.py        |   5 +-
 launch.py                  |  16 ++--
 4 files changed, 85 insertions(+), 103 deletions(-)

diff --git a/README.md b/README.md
index 4204878..2d4d7c9 100644
--- a/README.md
+++ b/README.md
@@ -11,26 +11,23 @@ Tensor parallelism is all you need. Run LLMs on weak devices or make powerful de
Distributed Llama running Llama 2 70B on 8 Raspberry Pi 4B devices

-**🔥 Download Model & Build by Single Command**
+### 🔥 Start with a Single Command

-Python 3 and GCC required.
+Python 3 and a C++ compiler are required.

-**Chat & API**
+| Model                   | Purpose   | Size     | Command                                   |
+| ----------------------- | --------- | -------- | ----------------------------------------- |
+| Llama 3 8B Q40          | Benchmark | 6.32 GB  | `python launch.py llama3_8b_q40`          |
+| Llama 3 8B Instruct Q40 | Chat, API | 6.32 GB  | `python launch.py llama3_8b_instruct_q40` |

-* Llama 3 8B Instruct: `python launch.py llama3_instruct`
+### 🛠️ Convert Model Manually

-**Convert Model Manually**
+Supported architectures: Llama, Mixtral, Grok

-* [Llama 2](./docs/LLAMA.md#how-to-run-llama-2)
-* [Llama 3](./docs/LLAMA.md#how-to-run-llama-3)
+* [How to Convert Llama 2, Llama 3](./docs/LLAMA.md)

-**Supported modes:**
+### 🚧 Known Limitations

-- Inference CLI
-- Chat CLI
-- [API Server](./src/apps/dllama-api/README.md)
-
-**Known limitations:**
* You can run Distributed Llama only on 1, 2, 4... 2^n nodes.
* The maximum number of nodes is equal to the number of KV heads in the model [#70](https://github.com/b4rtaz/distributed-llama/issues/70).
* Optimized for (weights format × buffer format):
@@ -45,7 +42,8 @@ Python 3 and GCC required.
  * ❌ Q40 × F32
  * ✅ Q40 × Q80

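The two limits above can be checked before a cluster is started. A minimal sketch — the helper below is hypothetical, not part of Distributed Llama, and only assumes you know the model's KV-head count (for example, Llama 3 8B uses 8 KV heads):

```python
def check_node_count(n_nodes: int, n_kv_heads: int) -> None:
    # Distributed Llama accepts only 1, 2, 4, ... 2^n nodes.
    if n_nodes < 1 or (n_nodes & (n_nodes - 1)) != 0:
        raise ValueError(f'{n_nodes} nodes: the node count must be a power of two')
    # KV heads are split across nodes, so the node count cannot exceed them.
    if n_nodes > n_kv_heads:
        raise ValueError(f'{n_nodes} nodes: this model supports at most {n_kv_heads} nodes')

check_node_count(4, 8)    # OK for a model with 8 KV heads
# check_node_count(3, 8)  # would raise: 3 is not a power of two
```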
-**Architecture**<br />
+### 👷 Architecture
+
The project is split up into two parts:
* **Root node** - it's responsible for loading the model and weights and forwarding them to the workers. It also synchronizes the state of the neural network. The root node is also a worker; it processes its own slice of the neural network.
* **Worker node** - it processes its own slice of the neural network. It doesn't require any configuration related to the model.
@@ -54,70 +52,62 @@ You always need the root node and you can add 2^n - 1 worker nodes to speed up t

## 📊 Measurements

-### Average Single Token Generation Time
+### Average Token Generation Time

-All tests below utilized Q40 weights and a Q80 buffer. The generation time encompasses the inference time, network transfer time, sampling time, and multi-thread synchronization time. Number of samples: 16.
+I - inference time of the root node, T - network transfer time of the root node.

**Raspberry Pi 5 8GB**

+Weights = Q40, Buffer = Q80, nSamples = 16, switch = TP-Link LS1008G, tested on 0.3.1 version
+
| Model       | 1 x RasPi 5 8 GB | 2 x RasPi 5 8 GB | 4 x RasPi 5 8 GB |
|-------------|------------------|------------------|------------------|
-| Llama 2 7B  | **441.09 ms**, 2.26 t/s<br />(I: 434.84 ms, T: 5.25 ms) | **341.46 ms**, 2.92 t/s<br />(I: 257.78 ms, T: 83.27 ms) | **219.08 ms**, 4.56 t/s<br />(I: 163.42 ms, T: 55.25 ms) |
-| Llama 3 8B  | **564.31 ms**, 1.77 t/s<br />(I: 556.67 ms, T: 6.17 ms) | **444.27 ms**, 2.25 t/s<br />(I: 362.73 ms, T: 80.11 ms) | **331.47 ms**, 3.01 t/s<br />(I: 267.62 ms, T: 62.34 ms) |
-
-I - inference time of the root node, T - network transfer time, tested on 0.3.1 version
+| Llama 2 7B  | **441.09 ms**, 2.26 t/s<br />I: 434.84 ms, T: 5.25 ms | **341.46 ms**, 2.92 t/s<br />I: 257.78 ms, T: 83.27 ms | **219.08 ms**, 4.56 t/s 🔥<br />I: 163.42 ms, T: 55.25 ms |
+| Llama 3 8B  | **564.31 ms**, 1.77 t/s<br />I: 556.67 ms, T: 6.17 ms | **444.27 ms**, 2.25 t/s<br />I: 362.73 ms, T: 80.11 ms | **331.47 ms**, 3.01 t/s 🔥<br />I: 267.62 ms, T: 62.34 ms |

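To make the root/worker split described in the Architecture section above concrete, here is a minimal NumPy sketch of column-wise tensor parallelism. It is only an illustration of the idea; the dimensions and names are made up and this is not the project's C++ implementation.

```python
import numpy as np

d, n_nodes = 8, 2
x = np.random.rand(d).astype(np.float32)     # activations for one token
w = np.random.rand(d, d).astype(np.float32)  # a full weight matrix

slices = np.split(w, n_nodes, axis=1)        # each node owns d / n_nodes columns
partial = [x @ s for s in slices]            # computed independently on each node
y = np.concatenate(partial)                  # the root gathers the partial outputs

assert np.allclose(y, x @ w)                 # same result as a single node
```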

**Raspberry Pi 4B 8 GB**

+Weights = Q40, Buffer = Q80, nSamples = 16, switch = TP-Link LS1008G, tested on 0.1.0 version
+
[Photo: 8 x Raspberry Pi 4B 8GB]

-All Raspberry Pi units were connected via Gigabit Ethernet to the TP-Link LS1008G Switch.
-
| Model       | 1 x RasPi 4B 8 GB | 2 x RasPi 4B 8 GB | 4 x RasPi 4B 8 GB | 8 x RasPi 4B 8 GB |
|-------------|-------------------|-------------------|-------------------|-------------------|
-| Llama 2 7B  | **1312.50 ms**<br />(I: 1307.94 ms, T: 1.81 ms) | **793.69 ms**<br />(I: 739.00 ms, T: 52.50 ms) | **494.00 ms** 🔥<br />(I: 458.81 ms, T: 34.06 ms) | **588.19 ms**<br />(I: 296.69 ms, T: 289.75 ms) |
-| Llama 2 13B | Not enough RAM | **1497.19 ms**<br />(I: 1465.06 ms, T: 30.88 ms) | **848.19 ms** 🔥<br />(I: 746.88 ms, T: 99.50 ms) | **1114.88 ms**<br />(I: 460.8 ms, T: 652.88 ms) |
-| Llama 2 70B | Not enough RAM | Not enough RAM | Not enough RAM | **4842.81 ms** 🔥<br />(I: 2121.94 ms, T: 2719.62 ms) |
-
-I - inference time of the root node, T - network transfer time, tested on 0.1.0 version
+| Llama 2 7B  | **1312.50 ms**<br />I: 1307.94 ms, T: 1.81 ms | **793.69 ms**<br />I: 739.00 ms, T: 52.50 ms | **494.00 ms** 🔥<br />I: 458.81 ms, T: 34.06 ms | **588.19 ms**<br />I: 296.69 ms, T: 289.75 ms |
+| Llama 2 13B | Not enough RAM | **1497.19 ms**<br />I: 1465.06 ms, T: 30.88 ms | **848.19 ms** 🔥<br />I: 746.88 ms, T: 99.50 ms | **1114.88 ms**<br />I: 460.8 ms, T: 652.88 ms |
+| Llama 2 70B | Not enough RAM | Not enough RAM | Not enough RAM | **4842.81 ms** 🔥<br />I: 2121.94 ms, T: 2719.62 ms |

**x86_64 CPU Cloud Server**

-All tests below were conducted on c3d-highcpu-30 (30 vCPU, 15 core, 59 GB memory) VMs in Google Cloud. [More details](https://github.com/b4rtaz/distributed-llama/discussions/9).
+Weights = Q40, Buffer = Q80, nSamples = 16, VMs = [c3d-highcpu-30](https://github.com/b4rtaz/distributed-llama/discussions/9), tested on 0.1.0 version

| Model       | 1 x VM | 2 x VM | 4 x VM |
|-------------|--------|--------|--------|
-| Llama 2 7B  | **101.81 ms**<br />(I: 101.06 ms, T: 0.19 ms) | **69.69 ms**<br />(I: 61.50 ms, T: 7.62 ms) | **53.69 ms** 🔥<br />(I: 40.25 ms, T: 12.81 ms) |
-| Llama 2 13B | **184.19 ms**<br />(I: 182.88 ms, T: 0.69 ms) | **115.38 ms**<br />(I: 107.12 ms, T: 7.81 ms) | **86.81 ms** 🔥<br />(I: 66.25 ms, T: 19.94 ms) |
-| Llama 2 70B | **909.69 ms**<br />(I: 907.25 ms, T: 1.75 ms) | **501.38 ms**<br />(I: 475.50 ms, T: 25.00 ms) | **293.06 ms** 🔥<br />(I: 264.00 ms, T: 28.50 ms) |
+| Llama 2 7B  | **101.81 ms**<br />I: 101.06 ms, T: 0.19 ms | **69.69 ms**<br />I: 61.50 ms, T: 7.62 ms | **53.69 ms** 🔥<br />I: 40.25 ms, T: 12.81 ms |
+| Llama 2 13B | **184.19 ms**<br />I: 182.88 ms, T: 0.69 ms | **115.38 ms**<br />I: 107.12 ms, T: 7.81 ms | **86.81 ms** 🔥<br />I: 66.25 ms, T: 19.94 ms |
+| Llama 2 70B | **909.69 ms**<br />I: 907.25 ms, T: 1.75 ms | **501.38 ms**<br />I: 475.50 ms, T: 25.00 ms | **293.06 ms** 🔥<br />I: 264.00 ms, T: 28.50 ms |

-I - inference time of the root node, T - network transfer time, tested on 0.1.0 version
-
-### Network Transfer for Generating Single Token
+### Network Transfer for Generating a Single Token

**F32 Buffer**

-| Model       | 2 devices | 4 devices | 8 devices |
-|-------------|-----------|-----------|-----------|
-| Llama 3 8B  | **2048 kB**<br />(S: 1024 kB, R: 1024 kB) | **6144 kB**<br />(S: 3072 kB, R: 3072 kB) | **14336 kB**<br />(S: 7168 kB, R: 7168 kB) |
-
-S - sent data from the root node to workers, R - received data by the root node from workers, tested on 0.7.1 version
+| Model       | 2 devices    | 4 devices     | 8 devices      |
+|-------------|--------------|---------------|----------------|
+| Llama 3 8B  | **2048 kB**  | **6144 kB**   | **14336 kB**   |

**Q80 Buffer**

-| Model       | 2 devices | 4 devices | 8 devices |
-|-------------|-----------|-----------|-----------|
-| Llama 3 8B  | **544 kB**<br />(S: 272 kB, R: 272 kB) | **1632 kB**<br />(S: 816 kB, R: 816 kB) | **3808 kB**<br />(S: 1904 kB, R: 1904 kB) |
+| Model       | 2 devices    | 4 devices     | 8 devices      |
+|-------------|--------------|---------------|----------------|
+| Llama 3 8B  | **544 kB**   | **1632 kB**   | **3808 kB**    |

-S - sent data from the root node to workers, R - received data by the root node from workers, tested on 0.7.1 version
-
-## 📟 How to Run on Raspberry Pi Devices
+## 📟 Set Up Raspberry Pi Devices

1. Install `Raspberry Pi OS Lite (64 bit)` on your Raspberry Pi devices. This OS doesn't have a desktop environment.
-2. Connect all devices to the Gigabit switch.
+2. Connect all devices to your switch or router.
3. Connect to all devices via SSH.
```
ssh user@raspberrypi1.local
@@ -127,27 +117,24 @@ ssh user@raspberrypi2.local
```sh
sudo apt install git
```
-5. Clone this repository:
+5. Clone this repository and compile Distributed Llama on all devices:
```sh
git clone https://github.com/b4rtaz/distributed-llama.git
-```
-6. Compile Distributed Llama:
-```sh
make dllama
```
-7. Transfer weights and the tokenizer file to the root device.
-8. Optional: assign static IP addresses.
+6. Transfer weights and the tokenizer file to the root device.
+7. Optional: assign static IP addresses.
```sh
sudo ip addr add 10.0.0.1/24 dev eth0 # 1st device
sudo ip addr add 10.0.0.2/24 dev eth0 # 2nd device
```
-9. Run worker nodes on worker devices:
+8. Run worker nodes on worker devices:
```sh
sudo nice -n -20 ./dllama worker --port 9998 --nthreads 4
```
-10. Run root node on the root device:
+9. Run root node on the root device:
```sh
-sudo nice -n -20 ./dllama inference --model ../dllama_llama-2-7b_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 10.0.0.2:9998
+sudo nice -n -20 ./dllama inference --model dllama_model_meta-llama-3-8b_q40.m --tokenizer dllama_tokenizer_llama3.t --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 10.0.0.2:9998
```

To add more worker nodes, just add more addresses to the `--workers` argument.

@@ -156,70 +143,57 @@ To add more worker nodes, just add more addresses to the `--workers` argument.
```
./dllama inference ... --workers 10.0.0.2:9998 10.0.0.3:9998 10.0.0.4:9998
```

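Before starting the root node, it can help to confirm that every worker is reachable on its `--port`. The small optional check below is not part of Distributed Llama; it is a sketch that assumes the example addresses used in the steps above.

```python
import socket

workers = ['10.0.0.2:9998', '10.0.0.3:9998']  # worker addresses from the steps above
for worker in workers:
    host, port = worker.split(':')
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(2.0)
        try:
            s.connect((host, int(port)))
            print(f'{worker} is reachable')
        except OSError as e:
            print(f'{worker} is not reachable: {e}')
```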
-[Share your results](https://github.com/b4rtaz/distributed-llama/discussions)!
+## 💻 Set Up Computers with macOS, Linux, or Windows

-## 💻 How to Run on MacOS, Linux, or Windows
+You need x86_64 CPUs with AVX2 support or ARM CPUs. Different devices in the cluster may have different CPUs.

-You need to have x86_64 AVX2 CPU or ARM CPU. Different devices may have different CPUs. The below instructions are for Debian-based distributions but you can easily adapt them to your distribution, macOS, or Windows.
+#### macOS or Linux

-### MacOS and Linux
+The instructions below are for Debian-based distributions, but you can easily adapt them to your distribution or macOS.

-1. Install Git and G++:
+1. Install Git and GCC:
```sh
sudo apt install git build-essential
```
-2. Clone this repository:
+2. Clone this repository and compile Distributed Llama on all computers:
```sh
git clone https://github.com/b4rtaz/distributed-llama.git
-```
-3. Compile Distributed Llama:
-```sh
make dllama
```
-4. Transfer weights and the tokenizer file to the root node.
-5. Run worker nodes on worker devices:
-```sh
-sudo nice -n -20 ./dllama worker --port 9998 --nthreads 4
-```
-6. Run root node on the root device:
-```sh
-sudo nice -n -20 ./dllama inference --model ../dllama_llama-2-7b_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 192.168.0.1:9998
-```
-7. To run the root node in the chat mode:
-```sh
-sudo nice -n -20 ./dllama chat --model ../dllama_llama-2-7b-chat_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --workers 192.168.0.1:9998
-```
-### Windows
+Continue to step 3.

-1. Install Git and Mingw (Chocolatey):
-  - https://chocolatey.org/install
+#### Windows
+
+1. Install Git and Mingw (via [Chocolatey](https://chocolatey.org/install)):
```powershell
choco install mingw
```
-2. Clone this repository:
+2. Clone this repository and compile Distributed Llama on all computers:
```sh
git clone https://github.com/b4rtaz/distributed-llama.git
-```
-3. Compile Distributed Llama:
-```sh
make dllama
```
-4. Transfer weights and the tokenizer file to the root node.
-5. Run worker nodes on worker devices:
+
+Continue to step 3.
+
+#### Run Cluster
+
+3. Transfer weights and the tokenizer file to the root computer.
+4. Run worker nodes on worker computers:
```sh
./dllama worker --port 9998 --nthreads 4
```
-6. Run root node on the root device:
-```sh
-./dllama inference --model ../dllama_llama-2-7b_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 192.168.0.1:9998
-```
-7. To run the root node in the chat mode:
+5. Run root node on the root computer:
```sh
-./dllama chat --model ../dllama_llama-2-7b-chat_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --workers 192.168.0.1:9998
+./dllama inference --model dllama_model_meta-llama-3-8b_q40.m --tokenizer dllama_tokenizer_llama3.t --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 192.168.0.1:9998
```
-[Share your results](https://github.com/b4rtaz/distributed-llama/discussions)!
+To add more worker nodes, just add more addresses to the `--workers` argument.
+
+```
+./dllama inference ... --workers 192.168.0.1:9998 192.168.0.2:9998 192.168.0.3:9998
+```

## 💡 License

diff --git a/converter/convert-llama.py b/converter/convert-llama.py
index 73a9b6f..31fae60 100644
--- a/converter/convert-llama.py
+++ b/converter/convert-llama.py
@@ -4,7 +4,7 @@
 import torch
 import math
 import numpy as np
-from writer import writeTensor, writeHeader, parseFloatType, FloatType
+from writer import writeTensor, writeHeader, parseFloatType, strFloatType, FloatType
 from pathlib import Path
 
 LAYER_CHUNK_SIZE = 48
@@ -107,12 +107,13 @@ def usage():
 
     modelPath = sys.argv[1]
     targetFloatType = parseFloatType(sys.argv[2])
+    targetFloatTypeStr = strFloatType(targetFloatType)
 
     modelName = os.path.basename(modelPath)
-    outputFileName = f'dllama_model_{modelName.lower()}_{sys.argv[2]}.m'
+    outputFileName = f'dllama_model_{modelName.lower()}_{targetFloatTypeStr}.m'
 
     print(f'Model name: {modelName}')
-    print(f'Target float type: {targetFloatType}')
+    print(f'Target float type: {targetFloatTypeStr}')
     print(f'Target file: {outputFileName}')
 
     convert(modelPath, outputFileName, targetFloatType)
diff --git a/converter/writer.py b/converter/writer.py
index 256eef5..56e6dd4 100644
--- a/converter/writer.py
+++ b/converter/writer.py
@@ -23,6 +23,9 @@ def parseFloatType(type):
         return floatType
     raise Exception(f'{type} is not supported')
 
+def strFloatType(type):
+    return floatTypeNames[type]
+
 def writeQuantizedQ40Tensor(file, x):
     x = x.to(torch.float32).numpy().astype(np.float32)
     blockSize = 32
@@ -105,7 +108,7 @@ def writeTensor(file, tensor, floatType):
     else:
         raise Exception(f'Unknown float type')
     t1 = time.time()
-    print(f'Saved {floatTypeNames[floatType]} tensor in {t1 - t0:.2f}s, {nBytes} bytes')
+    print(f'Saved {strFloatType(floatType)} tensor in {t1 - t0:.2f}s, {nBytes} bytes')
 
 def writeHeader(file, params):
     headerKeys = {
diff --git a/launch.py b/launch.py
index c8ebeb3..620270b 100644
--- a/launch.py
+++ b/launch.py
@@ -4,6 +4,11 @@
 
 # ['model-url', 'tokenizer-url', 'weights-float-type', 'buffer-float-type', 'model-type']
 MODELS = {
+    'llama3_8b_q40': [
+        'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Distributed-Llama/resolve/main/dllama_model_meta-llama-3-8b_q40.m?download=true',
+        'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_llama3.t?download=true',
+        'q40', 'q80', 'base'
+    ],
     'llama3_8b_instruct_q40': [
         'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_lama3_instruct_q40.m?download=true',
         'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3.t?download=true',
@@ -11,11 +16,12 @@
     ]
 }
 
-ALIASES = {
-    'llama3_instruct': 'llama3_8b_instruct_q40'
-}
-
 def downloadFile(url: str, path: str):
+    if (os.path.isfile(path)):
+        fileName = os.path.basename(path)
+        result = input(f'❓ {fileName} already exists, do you want to download again? ("Y" if yes): ')
+        if (result.upper() != 'Y'):
+            return
     response = requests.get(url, stream=True)
     response.raise_for_status()
     print(f'📄 {url}')
@@ -64,8 +70,6 @@ def printUsage():
     os.chdir(os.path.dirname(__file__))
 
     modelName = sys.argv[1].replace('-', '_')
-    if modelName in ALIASES:
-        modelName = ALIASES[modelName]
     if modelName not in MODELS:
         print(f'Model is not supported: {modelName}')
         exit(1)
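For context on the launch.py change: each `MODELS` entry follows the `['model-url', 'tokenizer-url', 'weights-float-type', 'buffer-float-type', 'model-type']` layout noted at the top of the file. The snippet below is a hypothetical sketch of how such an entry could be consumed if it were placed at the bottom of launch.py; the script's real download-and-run logic is not shown in this patch, so only `MODELS` and `downloadFile` are reused from it.

```python
import os

entry = MODELS['llama3_8b_q40']
modelUrl, tokenizerUrl, weightsFloatType, bufferFloatType, modelType = entry

# Derive local file names from the URLs (dropping the ?download=true suffix).
modelPath = os.path.basename(modelUrl.split('?')[0])
tokenizerPath = os.path.basename(tokenizerUrl.split('?')[0])

downloadFile(modelUrl, modelPath)
downloadFile(tokenizerUrl, tokenizerPath)

print(f'./dllama inference --model {modelPath} --tokenizer {tokenizerPath} '
      f'--weights-float-type {weightsFloatType} --buffer-float-type {bufferFloatType}')
```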