feat: tokenizer v1. (#83)
b4rtaz authored Jun 1, 2024
1 parent 916d696 commit a5e0445
Showing 11 changed files with 239 additions and 241 deletions.
114 changes: 73 additions & 41 deletions converter/convert-tokenizer-hf.py
@@ -1,71 +1,103 @@
import sys
import json
import os
+from sentencepiece import SentencePieceProcessor
writer = __import__('tokenizer-writer')

def openJson(path):
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

+class TokensResolver:
+    def __init__(self, dirPath, tokenizerConfig):
+        self.dirPath = dirPath
+        self.tokenizerConfig = tokenizerConfig
+        self.bosId = None
+        self.eosId = None
+        self.tokens = []
+        self.scores = []
+
+    def resolvePreTrainedTokenizerFast(self):
+        tokenizer = openJson(os.path.join(self.dirPath, 'tokenizer.json'))
+        assert(tokenizer['model']['type'] == 'BPE')
+
+        i = 0
+        for token in tokenizer['model']['vocab'].keys():
+            assert(tokenizer['model']['vocab'][token] == i)
+            self.tokens.append(token.encode('utf8'))
+            self.scores.append(-float(i))
+            i += 1
+        if ('added_tokens' in tokenizer):
+            for at in tokenizer['added_tokens']:
+                assert(at['id'] == i)
+                self.tokens.append(at['content'].encode('utf8'))
+                self.scores.append(-float(i))
+                if (at['content'] == self.tokenizerConfig['bos_token']):
+                    self.bosId = i
+                if (at['content'] == self.tokenizerConfig['eos_token']):
+                    self.eosId = i
+                i += 1
+
+    def resolveLlamaTokenizer(self):
+        modelPath = os.path.join(self.dirPath, 'tokenizer.model')
+        processor = SentencePieceProcessor(model_file=modelPath)
+
+        assert processor.vocab_size() == processor.get_piece_size()
+        self.bosId = processor.bos_id()
+        self.eosId = processor.eos_id()
+
+        vocabSize = processor.vocab_size()
+        for i in range(vocabSize):
+            t = processor.id_to_piece(i)
+            s = processor.get_score(i)
+            t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
+            b = t.encode('utf-8')
+            self.tokens.append(b)
+            self.scores.append(s)
+
+    def resolve(self):
+        cls = self.tokenizerConfig['tokenizer_class']
+        if (cls == 'PreTrainedTokenizerFast'):
+            return self.resolvePreTrainedTokenizerFast()
+        if (cls == 'LlamaTokenizer'):
+            return self.resolveLlamaTokenizer()
+        raise Exception(f'Tokenizer {cls} is not supported')

def printUsage():
    print('Usage: python convert-tokenizer-hf.py <tokenizerFolderPath> <name>')
    print()
    print('Options:')
-    print(' <tokenizerFolderPath> The path to the folder with tokenizer.json and tokenizer_config.json')
+    print(' <tokenizerFolderPath> The path to the folder with tokenizer_config.json')
    print(' <name> The name of the tokenizer (e.g. "llama3")')

if __name__ == '__main__':
    if (len(sys.argv) < 2):
        printUsage()
        exit(1)

    dirPath = sys.argv[1]
    name = sys.argv[2]
    tokenizerConfig = openJson(os.path.join(dirPath, 'tokenizer_config.json'))
-    tokenizer = openJson(os.path.join(dirPath, 'tokenizer.json'))

-    assert(tokenizerConfig['tokenizer_class'] == 'PreTrainedTokenizerFast')
-    assert(tokenizer['model']['type'] == 'BPE')
-    i = 0
-    tokens = []
-    scores = []
-    bosId = None
-    eosId = None
-    for token in tokenizer['model']['vocab'].keys():
-        assert(tokenizer['model']['vocab'][token] == i)
-        tokens.append(token.encode('utf8'))
-        scores.append(-float(i))
-        i += 1
-    if ('added_tokens' in tokenizer):
-        for at in tokenizer['added_tokens']:
-            assert(at['id'] == i)
-            tokens.append(at['content'].encode('utf8'))
-            scores.append(-float(i))
-            if (at['content'] == tokenizerConfig['bos_token']):
-                bosId = i
-            if (at['content'] == tokenizerConfig['eos_token']):
-                eosId = i
-            i += 1
+    resolver = TokensResolver(dirPath, tokenizerConfig)
+    resolver.resolve()

+    print(f'bosId: {resolver.bosId} ({resolver.tokens[resolver.bosId]})')
+    print(f'eosId: {resolver.eosId} ({resolver.tokens[resolver.eosId]})')

-    templateChat = None
+    chatTemplate = None
+    chatExtraStop = None
    if ('chat_template' in tokenizerConfig):
-        template = tokenizerConfig['chat_template']
-        print('⭐ Found chat template:')
-        print()
-        print(template.replace('\n', '\\n'))
-        print()
-        print('⭐ To create the tokenizer file you need to manually specify chat template values. Enter \\n for new line.')
-        templateChat = {}
-        templateKeys = ['chat_message_start', 'chat_role_start', 'chat_role_end', 'chat_message_end', 'chat_generation_prompt', 'chat_extra_stop']
-        for key in templateKeys:
-            value = input(f'⏩ Enter value for chat template key "{key}":\n')
-            templateChat[key] = value.replace('\\n', '\n')
+        chatTemplate = tokenizerConfig['chat_template'].encode('utf-8')
+        input = input('⏩ Enter value for chat extra stop (enter to skip): ')
+        if (input != ''):
+            chatExtraStop = input.encode('utf-8')

    outputFileName = f'dllama_tokenizer_{name}.t'
    with open(outputFileName, 'wb') as outputFile:
        writer.writeTokenizer(outputFile, {
-            'bos_id': bosId,
-            'eos_id': eosId,
-            'chat_eos_id': eosId,
-        }, templateChat, tokens, scores)
+            'bos_id': resolver.bosId,
+            'eos_id': resolver.eosId,
+            'chat_eos_id': resolver.eosId,
+        }, resolver.tokens, resolver.scores, chatTemplate, chatExtraStop)
    print(f'✅ Created {outputFileName}')
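The BPE branch above maps vocabulary ranks straight to token ids: the score is the negated rank, and `added_tokens` continue the numbering. A self-contained sketch with a made-up toy vocabulary (the token strings here are hypothetical, not from any real tokenizer.json):

```python
# Toy sketch of the resolvePreTrainedTokenizerFast() logic above:
# vocab order gives token ids, the score is the negated rank, and
# added_tokens extend the list (this vocabulary is made up).
tokenizerJson = {
    'model': {'type': 'BPE', 'vocab': {'<unk>': 0, 'he': 1, 'llo': 2}},
    'added_tokens': [{'id': 3, 'content': '<|begin_of_text|>'}],
}
tokens, scores = [], []
for i, token in enumerate(tokenizerJson['model']['vocab'].keys()):
    assert tokenizerJson['model']['vocab'][token] == i
    tokens.append(token.encode('utf8'))
    scores.append(-float(i))
for at in tokenizerJson.get('added_tokens', []):
    tokens.append(at['content'].encode('utf8'))
    scores.append(-float(at['id']))
print(tokens)   # [b'<unk>', b'he', b'llo', b'<|begin_of_text|>']
print(scores)   # [-0.0, -1.0, -2.0, -3.0]
```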
11 changes: 2 additions & 9 deletions converter/convert-tokenizer-llama3.py
@@ -30,14 +30,7 @@
bosId = 128000
eosId = 128001
chatEosId = 128009
-chatTemplate = {
-    'chat_message_start': '',
-    'chat_role_start': '<|start_header_id|>',
-    'chat_role_end': '<|end_header_id|>\n\n',
-    'chat_message_end': '<|eot_id|>',
-    'chat_generation_prompt': '<|start_header_id|>assistant<|end_header_id|>\n\n',
-    'chat_extra_stop': ''
-}
+chatTemplate = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

def printUsage():
    print('Usage: python convert-tokenizer-llama3.py <tokenizerPath>')
@@ -79,6 +72,6 @@ def printUsage():
        'bos_id': bosId,
        'eos_id': eosId,
        'chat_eos_id': chatEosId,
-    }, chatTemplate, tokens, scores)
+    }, tokens, scores, chatTemplate.encode('utf-8'), None)

print(f'✅ Created {outputFileName}')
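The converter now stores the tokenizer's raw Jinja chat template instead of the six fixed `chat_*` strings of version 0. A minimal sketch of how such a template expands at prompt-building time; it assumes the `jinja2` package and Llama 3's `<|begin_of_text|>` BOS string, neither of which this converter itself uses:

```python
# Minimal sketch (assumes the jinja2 package; the bos_token value is an
# assumption) of how the stored Jinja chat template renders messages.
from jinja2 import Template

chatTemplate = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

prompt = Template(chatTemplate).render(
    messages=[{'role': 'user', 'content': 'Hello!'}],
    bos_token='<|begin_of_text|>',
    add_generation_prompt=True)
print(prompt)
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
```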
86 changes: 0 additions & 86 deletions converter/convert-tokenizer-sentencepiece.py

This file was deleted.

34 changes: 18 additions & 16 deletions converter/tokenizer-writer.py
@@ -1,6 +1,6 @@
import struct

-def writeTokenizer(file, params, chatTemplate, tokens, scores):
+def writeTokenizer(file, params, tokens, scores, chatTemplate, chatExtraStop):
    assert(params['eos_id'] is not None)
    assert(params['bos_id'] is not None)

@@ -12,43 +12,45 @@ def writeTokenizer(file, params, chatTemplate, tokens, scores):
        'eos_id': 4,
        'pad_id': 5,
        'chat_eos_id': 6,
-        'chat_template': 7
+        'chat_template': 7,
+        'chat_stop': 8
    }
    header = struct.pack('i', 0x567124)

    nTokens = len(tokens)
    maxTokenLength = max(len(t) for t in tokens)

-    params['version'] = 0
+    params['version'] = 1
    params['vocab_size'] = nTokens
    params['max_token_length'] = maxTokenLength
    if (chatTemplate):
        params['chat_template'] = len(chatTemplate)
+    if (chatExtraStop):
+        params['chat_stop'] = len(chatExtraStop)

    data = b''
    for key in params:
        value = params[key]
        if value is None:
            continue
        if key in headerKeys:
            data += struct.pack('ii', headerKeys[key], params[key])
        else:
            print(f'Unknown header key: {key}')

-    header += struct.pack('i', len(header) * 2 + len(data))
-    file.write(header)
-    file.write(data)

    print('⭐ Params:')
    print(params)
+    if (chatTemplate):
+        print('⭐ Chat template:')
+        print(chatTemplate)

-    if (chatTemplate):
-        chatTemplateValue = list(chatTemplate.values())
-        nChatTemplates = len(chatTemplateValue)
-        for i in range(0, nChatTemplates):
-            file.write(struct.pack('I', len(chatTemplateValue[i].encode('utf8'))))
-        for i in range(0, nChatTemplates):
-            data = chatTemplateValue[i].encode('utf8')
-            if (len(data) > 0):
-                file.write(data)
+    header += struct.pack('i', len(header) * 2 + len(data))
+    file.write(header)
+    file.write(data)
+    if chatTemplate:
+        file.write(chatTemplate)
+    if chatExtraStop:
+        file.write(chatExtraStop)

    for i in range(0, nTokens):
        size = len(tokens[i])
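The resulting v1 file layout is: a magic number, a header size, the key-value pairs, the raw chat template bytes, the raw extra-stop bytes, and then the token data. A sketch of a header reader based only on what is visible in this diff (the key ids below 4 and the token payload are collapsed above, so they are left out):

```python
# Sketch of a v1 header reader matching writeTokenizer() above. Only
# key ids visible in this diff are named; others pass through as ints.
import struct

VISIBLE_KEYS = {4: 'eos_id', 5: 'pad_id', 6: 'chat_eos_id',
                7: 'chat_template', 8: 'chat_stop'}

def readTokenizerHeader(path):
    with open(path, 'rb') as f:
        magic, headerSize = struct.unpack('ii', f.read(8))
        assert magic == 0x567124
        params = {}
        # headerSize counts the magic, the size field and the pairs,
        # so (headerSize - 8) bytes of 8-byte key-value pairs remain
        for _ in range((headerSize - 8) // 8):
            key, value = struct.unpack('ii', f.read(8))
            params[VISIBLE_KEYS.get(key, key)] = value
        # chat_template/chat_stop values are byte lengths; the raw
        # bytes follow the header in that order
        chatTemplate = f.read(params['chat_template']) if 'chat_template' in params else None
        chatExtraStop = f.read(params['chat_stop']) if 'chat_stop' in params else None
        return params, chatTemplate, chatExtraStop
```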
6 changes: 3 additions & 3 deletions src/apps/dllama-api/README.md
@@ -1,13 +1,13 @@
# Distributed Llama API

-This is an early version of the server that is compatible with the OpenAi API. It supports only the `/v1/chat/completions` endpoint. Currently it's adjusted to the Llama 3 8B Instruct only.
+This is an early version of the server that is compatible with the OpenAI API. It supports only the `/v1/chat/completions` endpoint. To run this server, you need a chat model and a tokenizer with chat support.

How to run?

-1. Download the model and the tokenizer from [here](https://huggingface.co/Azamorn/Meta-Llama-3-8B-Instruct-Distributed).
+1. Download the model and the tokenizer from [here](https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama).
2. Run the server with the following command:
```bash
-./dllama-api --model converter/dllama_original_q40.bin --tokenizer converter/dllama-llama3-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4
+./dllama-api --model converter/dllama_model_lama3_instruct_q40.m --tokenizer converter/dllama_tokenizer_llama3.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4
```

Check the [chat-api-client.js](../../../examples/chat-api-client.js) file to see how to use the API from a Node.js application.
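For a quick test without Node.js, a request sketch in Python; the port and the exact response fields are assumptions following the usual OpenAI chat-completions shape, not something this README specifies:

```python
# Hedged sketch of calling the chat completions endpoint; the port and
# the response fields are assumptions, not taken from this repository.
import json
import urllib.request

req = urllib.request.Request(
    'http://127.0.0.1:9990/v1/chat/completions',
    data=json.dumps({
        'messages': [{'role': 'user', 'content': 'Hello!'}]
    }).encode('utf-8'),
    headers={'Content-Type': 'application/json'})
with urllib.request.urlopen(req) as response:
    body = json.load(response)
print(body['choices'][0]['message']['content'])
```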