Skip to content

Commit

Permalink
feat: add llama3_1_405b_instruct_q40 to launch.py. (#112)
Browse files Browse the repository at this point in the history
  • Loading branch information
b4rtaz authored Jul 31, 2024
1 parent ee2c689 commit 5244daa
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 25 deletions.
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ Tensor parallelism is all you need. Run LLMs on weak devices or make powerful de

Python 3 and a C++ compiler are required. The command will download the model and the tokenizer.

| Model | Purpose | Size | Command |
| ------------------------- | --------- | -------- | ------------------------------------------- |
| TinyLlama 1.1B 3T Q40 | Benchmark | 844 MB | `python launch.py tinyllama_1_1b_3t_q40` |
| Llama 3 8B Q40 | Benchmark | 6.32 GB | `python launch.py llama3_8b_q40` |
| Llama 3 8B Instruct Q40 | Chat, API | 6.32 GB | `python launch.py llama3_8b_instruct_q40` |
| Llama 3.1 8B Instruct Q40 | Chat, API | 6.32 GB | `python launch.py llama3_1_8b_instruct_q40` |
| Model | Purpose | Size | Command |
| --------------------------- | --------- | -------- | --------------------------------------------- |
| TinyLlama 1.1B 3T Q40 | Benchmark | 844 MB | `python launch.py tinyllama_1_1b_3t_q40` |
| Llama 3 8B Q40 | Benchmark | 6.32 GB | `python launch.py llama3_8b_q40` |
| Llama 3 8B Instruct Q40 | Chat, API | 6.32 GB | `python launch.py llama3_8b_instruct_q40` |
| Llama 3.1 8B Instruct Q40 | Chat, API | 6.32 GB | `python launch.py llama3_1_8b_instruct_q40` |
| Llama 3.1 405B Instruct Q40 | Chat, API | 238 GB | `python launch.py llama3_1_405b_instruct_q40` |

### 🛠️ Convert Model Manually

Expand Down
54 changes: 35 additions & 19 deletions launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,59 +2,75 @@
import sys
import requests

# ['model-url', 'tokenizer-url', 'weights-float-type', 'buffer-float-type', 'model-type']
def parts(length):
    """Return the first `length` two-letter lowercase suffixes.

    The suffixes are produced in order: 'aa', 'ab', ..., 'az', 'ba', ..., 'zz'.

    Args:
        length: number of suffixes to generate. Zero or a negative value
            yields an empty list.

    Returns:
        A list of `length` two-character strings.

    Raises:
        ValueError: if `length` is greater than 26 * 26, because two letters
            cannot encode more parts than that.
    """
    maxParts = 26 * 26
    if (length > maxParts):
        # Without this guard the first character would run past 'z' and
        # silently produce suffixes such as '{a'.
        raise ValueError(f'parts() supports at most {maxParts} suffixes, got {length}')
    # 97 is ord('a'): the high "digit" picks the first letter, the low one the second.
    return [chr(97 + i // 26) + chr(97 + i % 26) for i in range(length)]

# Maps a model name to its download description. Every value has the layout:
# [['model-url-0', 'model-url-1', ...], 'tokenizer-url', 'weights-float-type', 'buffer-float-type', 'model-type']
# The first element is always a list, even for a model stored as a single file;
# a model split into several parts lists the part URLs in order.
MODELS = {
    'tinyllama_1_1b_3t_q40': [
        ['https://huggingface.co/b4rtaz/TinyLlama-1.1B-3T-Distributed-Llama/resolve/main/dllama_model_tinylama_1.1b_3t_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/TinyLlama-1.1B-3T-Distributed-Llama/resolve/main/dllama_tokenizer_tinylama_1.1b_3t.t?download=true',
        'q40', 'q80', 'base'
    ],
    'llama3_8b_q40': [
        ['https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Distributed-Llama/resolve/main/dllama_model_meta-llama-3-8b_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_llama3.t?download=true',
        'q40', 'q80', 'base'
    ],
    'llama3_8b_instruct_q40': [
        ['https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_lama3_instruct_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3.t?download=true',
        'q40', 'q80', 'chat'
    ],
    'llama3_1_8b_instruct_q40': [
        ['https://huggingface.co/b4rtaz/Llama-3_1-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.1_instruct_q40.m?download=true'],
        'https://huggingface.co/b4rtaz/Llama-3_1-8B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama_3_1.t?download=true',
        'q40', 'q80', 'chat'
    ],
    'llama3_1_405b_instruct_q40': [
        # This model is published as 56 parts whose URLs end in the suffixes from parts(56).
        [
            f'https://huggingface.co/b4rtaz/Llama-3_1-405B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama31_405b_q40_{suffix}?download=true'
            for suffix in parts(56)
        ],
        'https://huggingface.co/b4rtaz/Llama-3_1-405B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama_3_1.t?download=true',
        'q40', 'q80', 'chat'
    ],
}

def downloadFile(urls, path: str):
    """Download one or more URLs and write them, in order, into a single file.

    If `path` already exists the user is asked whether to download again;
    any answer other than "Y" (case-insensitive) leaves the file untouched.

    Args:
        urls: a list of URLs whose bodies are written to `path` one after
            another. A single URL string is also accepted and treated as a
            one-element list.
        path: destination file path; it is opened in binary write mode, so
            existing content is overwritten.

    Raises:
        requests.HTTPError: if any of the requests returns an error status
            (via `raise_for_status`).
    """
    if (isinstance(urls, str)):
        # Iterating a bare string would request every character as its own URL.
        urls = [urls]

    if (os.path.isfile(path)):
        fileName = os.path.basename(path)
        result = input(f'❓ {fileName} already exists, do you want to download again? ("Y" if yes): ')
        if (result.upper() != 'Y'):
            return

    lastSizeMb = 0
    with open(path, 'wb') as file:
        for url in urls:
            print(f'📄 {url}')
            # The context manager releases the streamed connection even when a
            # request or a write fails part-way through.
            with requests.get(url, stream=True) as response:
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=4096):
                    file.write(chunk)
                    # Progress is the total size of the output file, so it keeps
                    # growing across parts; only redraw when the MB value changes.
                    sizeMb = file.tell() // (1024 * 1024)
                    if (sizeMb != lastSizeMb):
                        sys.stdout.write("\rDownloaded %i MB" % sizeMb)
                        lastSizeMb = sizeMb
            sys.stdout.write('\n')
    sys.stdout.write(' ✅\n')

def download(modelName: str, model: list):
    """Download the model and tokenizer files of `modelName` into models/<modelName>.

    Args:
        modelName: name used for the target directory and for the local file names.
        model: model description whose first element is the list of model
            URLs and whose second element is the tokenizer URL.

    Returns:
        A tuple `(modelPath, tokenizerPath)` with the local file paths.
    """
    dirPath = os.path.join('models', modelName)
    print(f'📀 Downloading {modelName} to {dirPath}...')
    os.makedirs(dirPath, exist_ok=True)
    modelUrls = model[0]
    tokenizerUrl = model[1]
    modelPath = os.path.join(dirPath, f'dllama_model_{modelName}.m')
    tokenizerPath = os.path.join(dirPath, f'dllama_tokenizer_{modelName}.t')
    # Each file is fetched exactly once; the tokenizer is a single URL, so it
    # is wrapped in a list to match the list-of-URLs form used for the model.
    downloadFile(modelUrls, modelPath)
    downloadFile([tokenizerUrl], tokenizerPath)
    print('📀 All files are downloaded')
    return (modelPath, tokenizerPath)

Expand Down

0 comments on commit 5244daa

Please sign in to comment.