feat: tokenizer v1. (#83)
b4rtaz authored Jun 1, 2024
1 parent 916d696 commit a5e0445
Showing 11 changed files with 239 additions and 241 deletions.
114 changes: 73 additions & 41 deletions converter/convert-tokenizer-hf.py
@@ -1,71 +1,103 @@
import sys
import json
import os
+from sentencepiece import SentencePieceProcessor
writer = __import__('tokenizer-writer')

def openJson(path):
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

+class TokensResolver:
+    def __init__(self, dirPath, tokenizerConfig):
+        self.dirPath = dirPath
+        self.tokenizerConfig = tokenizerConfig
+        self.bosId = None
+        self.eosId = None
+        self.tokens = []
+        self.scores = []
+
+    def resolvePreTrainedTokenizerFast(self):
+        tokenizer = openJson(os.path.join(self.dirPath, 'tokenizer.json'))
+        assert(tokenizer['model']['type'] == 'BPE')
+
+        i = 0
+        for token in tokenizer['model']['vocab'].keys():
+            assert(tokenizer['model']['vocab'][token] == i)
+            self.tokens.append(token.encode('utf8'))
+            self.scores.append(-float(i))
+            i += 1
+        if ('added_tokens' in tokenizer):
+            for at in tokenizer['added_tokens']:
+                assert(at['id'] == i)
+                self.tokens.append(at['content'].encode('utf8'))
+                self.scores.append(-float(i))
+                if (at['content'] == self.tokenizerConfig['bos_token']):
+                    self.bosId = i
+                if (at['content'] == self.tokenizerConfig['eos_token']):
+                    self.eosId = i
+                i += 1
+
+    def resolveLlamaTokenizer(self):
+        modelPath = os.path.join(self.dirPath, 'tokenizer.model')
+        processor = SentencePieceProcessor(model_file=modelPath)
+
+        assert processor.vocab_size() == processor.get_piece_size()
+        self.bosId = processor.bos_id()
+        self.eosId = processor.eos_id()
+
+        vocabSize = processor.vocab_size()
+        for i in range(vocabSize):
+            t = processor.id_to_piece(i)
+            s = processor.get_score(i)
+            t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
+            b = t.encode('utf-8')
+            self.tokens.append(b)
+            self.scores.append(s)
+
+    def resolve(self):
+        cls = self.tokenizerConfig['tokenizer_class']
+        if (cls == 'PreTrainedTokenizerFast'):
+            return self.resolvePreTrainedTokenizerFast()
+        if (cls == 'LlamaTokenizer'):
+            return self.resolveLlamaTokenizer()
+        raise Exception(f'Tokenizer {cls} is not supported')

def printUsage():
    print('Usage: python convert-tokenizer-hf.py <tokenizerFolderPath> <name>')
    print()
    print('Options:')
-    print(' <tokenizerFolderPath> The path to the folder with tokenizer.json and tokenizer_config.json')
+    print(' <tokenizerFolderPath> The path to the folder with tokenizer_config.json')
    print(' <name> The name of the tokenizer (e.g. "llama3")')

if __name__ == '__main__':
    if (len(sys.argv) < 2):
        printUsage()
        exit(1)

    dirPath = sys.argv[1]
    name = sys.argv[2]
    tokenizerConfig = openJson(os.path.join(dirPath, 'tokenizer_config.json'))
-    tokenizer = openJson(os.path.join(dirPath, 'tokenizer.json'))

-    assert(tokenizerConfig['tokenizer_class'] == 'PreTrainedTokenizerFast')
-    assert(tokenizer['model']['type'] == 'BPE')
-    i = 0
-    tokens = []
-    scores = []
-    bosId = None
-    eosId = None
-    for token in tokenizer['model']['vocab'].keys():
-        assert(tokenizer['model']['vocab'][token] == i)
-        tokens.append(token.encode('utf8'))
-        scores.append(-float(i))
-        i += 1
-    if ('added_tokens' in tokenizer):
-        for at in tokenizer['added_tokens']:
-            assert(at['id'] == i)
-            tokens.append(at['content'].encode('utf8'))
-            scores.append(-float(i))
-            if (at['content'] == tokenizerConfig['bos_token']):
-                bosId = i
-            if (at['content'] == tokenizerConfig['eos_token']):
-                eosId = i
-            i += 1
+    resolver = TokensResolver(dirPath, tokenizerConfig)
+    resolver.resolve()

+    print(f'bosId: {resolver.bosId} ({resolver.tokens[resolver.bosId]})')
+    print(f'eosId: {resolver.eosId} ({resolver.tokens[resolver.eosId]})')

-    templateChat = None
+    chatTemplate = None
+    chatExtraStop = None
    if ('chat_template' in tokenizerConfig):
-        template = tokenizerConfig['chat_template']
-        print('⭐ Found chat template:')
-        print()
-        print(template.replace('\n', '\\n'))
-        print()
-        print('⭐ To create the tokenizer file you need to manually specify chat template values. Enter \\n for new line.')
-        templateChat = {}
-        templateKeys = ['chat_message_start', 'chat_role_start', 'chat_role_end', 'chat_message_end', 'chat_generation_prompt', 'chat_extra_stop']
-        for key in templateKeys:
-            value = input(f'⏩ Enter value for chat template key "{key}":\n')
-            templateChat[key] = value.replace('\\n', '\n')
+        chatTemplate = tokenizerConfig['chat_template'].encode('utf-8')
+        input = input('⏩ Enter value for chat extra stop (enter to skip): ')
+        if (input != ''):
+            chatExtraStop = input.encode('utf-8')

    outputFileName = f'dllama_tokenizer_{name}.t'
    with open(outputFileName, 'wb') as outputFile:
        writer.writeTokenizer(outputFile, {
-            'bos_id': bosId,
-            'eos_id': eosId,
-            'chat_eos_id': eosId,
-        }, templateChat, tokens, scores)
+            'bos_id': resolver.bosId,
+            'eos_id': resolver.eosId,
+            'chat_eos_id': resolver.eosId,
+        }, resolver.tokens, resolver.scores, chatTemplate, chatExtraStop)
    print(f'✅ Created {outputFileName}')
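The BPE branch above maps vocabulary ranks straight to token ids: the score is the negated rank, and `added_tokens` continue the numbering. A self-contained sketch with a made-up toy vocabulary (the token strings here are hypothetical, not from any real tokenizer.json):

```python
# Toy sketch of the resolvePreTrainedTokenizerFast() logic above:
# vocab order gives token ids, the score is the negated rank, and
# added_tokens extend the list (this vocabulary is made up).
tokenizerJson = {
    'model': {'type': 'BPE', 'vocab': {'<unk>': 0, 'he': 1, 'llo': 2}},
    'added_tokens': [{'id': 3, 'content': '<|begin_of_text|>'}],
}
tokens, scores = [], []
for i, token in enumerate(tokenizerJson['model']['vocab'].keys()):
    assert tokenizerJson['model']['vocab'][token] == i
    tokens.append(token.encode('utf8'))
    scores.append(-float(i))
for at in tokenizerJson.get('added_tokens', []):
    tokens.append(at['content'].encode('utf8'))
    scores.append(-float(at['id']))
print(tokens)   # [b'<unk>', b'he', b'llo', b'<|begin_of_text|>']
print(scores)   # [-0.0, -1.0, -2.0, -3.0]
```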
11 changes: 2 additions & 9 deletions converter/convert-tokenizer-llama3.py
@@ -30,14 +30,7 @@
bosId = 128000
eosId = 128001
chatEosId = 128009
-chatTemplate = {
-    'chat_message_start': '',
-    'chat_role_start': '<|start_header_id|>',
-    'chat_role_end': '<|end_header_id|>\n\n',
-    'chat_message_end': '<|eot_id|>',
-    'chat_generation_prompt': '<|start_header_id|>assistant<|end_header_id|>\n\n',
-    'chat_extra_stop': ''
-}
+chatTemplate = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

def printUsage():
    print('Usage: python convert-tokenizer-llama3.py <tokenizerPath>')
@@ -79,6 +72,6 @@ def printUsage():
        'bos_id': bosId,
        'eos_id': eosId,
        'chat_eos_id': chatEosId,
-    }, chatTemplate, tokens, scores)
+    }, tokens, scores, chatTemplate.encode('utf-8'), None)

print(f'✅ Created {outputFileName}')
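The converter now stores the tokenizer's raw Jinja chat template instead of the six fixed `chat_*` strings of version 0. A minimal sketch of how such a template expands at prompt-building time; it assumes the `jinja2` package and Llama 3's `<|begin_of_text|>` BOS string, neither of which this converter itself uses:

```python
# Minimal sketch (assumes the jinja2 package; the bos_token value is an
# assumption) of how the stored Jinja chat template renders messages.
from jinja2 import Template

chatTemplate = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

prompt = Template(chatTemplate).render(
    messages=[{'role': 'user', 'content': 'Hello!'}],
    bos_token='<|begin_of_text|>',
    add_generation_prompt=True)
print(prompt)
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
```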
86 changes: 0 additions & 86 deletions converter/convert-tokenizer-sentencepiece.py

This file was deleted.

34 changes: 18 additions & 16 deletions converter/tokenizer-writer.py
@@ -1,6 +1,6 @@
import struct

-def writeTokenizer(file, params, chatTemplate, tokens, scores):
+def writeTokenizer(file, params, tokens, scores, chatTemplate, chatExtraStop):
    assert(params['eos_id'] is not None)
    assert(params['bos_id'] is not None)

@@ -12,43 +12,45 @@ def writeTokenizer(file, params, chatTemplate, tokens, scores):
        'eos_id': 4,
        'pad_id': 5,
        'chat_eos_id': 6,
-        'chat_template': 7
+        'chat_template': 7,
+        'chat_stop': 8
    }
    header = struct.pack('i', 0x567124)

    nTokens = len(tokens)
    maxTokenLength = max(len(t) for t in tokens)

-    params['version'] = 0
+    params['version'] = 1
    params['vocab_size'] = nTokens
    params['max_token_length'] = maxTokenLength
    if (chatTemplate):
        params['chat_template'] = len(chatTemplate)
+    if (chatExtraStop):
+        params['chat_stop'] = len(chatExtraStop)

    data = b''
    for key in params:
        value = params[key]
        if value is None:
            continue
        if key in headerKeys:
            data += struct.pack('ii', headerKeys[key], params[key])
        else:
            print(f'Unknown header key: {key}')

-    header += struct.pack('i', len(header) * 2 + len(data))
-    file.write(header)
-    file.write(data)

    print('⭐ Params:')
    print(params)
+    if (chatTemplate):
+        print('⭐ Chat template:')
+        print(chatTemplate)

-    if (chatTemplate):
-        chatTemplateValue = list(chatTemplate.values())
-        nChatTemplates = len(chatTemplateValue)
-        for i in range(0, nChatTemplates):
-            file.write(struct.pack('I', len(chatTemplateValue[i].encode('utf8'))))
-        for i in range(0, nChatTemplates):
-            data = chatTemplateValue[i].encode('utf8')
-            if (len(data) > 0):
-                file.write(data)
+    header += struct.pack('i', len(header) * 2 + len(data))
+    file.write(header)
+    file.write(data)
+    if chatTemplate:
+        file.write(chatTemplate)
+    if chatExtraStop:
+        file.write(chatExtraStop)

    for i in range(0, nTokens):
        size = len(tokens[i])
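The resulting v1 file layout is: a magic number, a header size, the key-value pairs, the raw chat template bytes, the raw extra-stop bytes, and then the token data. A sketch of a header reader based only on what is visible in this diff (the key ids below 4 and the token payload are collapsed above, so they are left out):

```python
# Sketch of a v1 header reader matching writeTokenizer() above. Only
# key ids visible in this diff are named; others pass through as ints.
import struct

VISIBLE_KEYS = {4: 'eos_id', 5: 'pad_id', 6: 'chat_eos_id',
                7: 'chat_template', 8: 'chat_stop'}

def readTokenizerHeader(path):
    with open(path, 'rb') as f:
        magic, headerSize = struct.unpack('ii', f.read(8))
        assert magic == 0x567124
        params = {}
        # headerSize counts the magic, the size field and the pairs,
        # so (headerSize - 8) bytes of 8-byte key-value pairs remain
        for _ in range((headerSize - 8) // 8):
            key, value = struct.unpack('ii', f.read(8))
            params[VISIBLE_KEYS.get(key, key)] = value
        # chat_template/chat_stop values are byte lengths; the raw
        # bytes follow the header in that order
        chatTemplate = f.read(params['chat_template']) if 'chat_template' in params else None
        chatExtraStop = f.read(params['chat_stop']) if 'chat_stop' in params else None
        return params, chatTemplate, chatExtraStop
```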
6 changes: 3 additions & 3 deletions src/apps/dllama-api/README.md
@@ -1,13 +1,13 @@
# Distributed Llama API

-This is an early version of the server that is compatible with the OpenAi API. It supports only the `/v1/chat/completions` endpoint. Currently it's adjusted to the Llama 3 8B Instruct only.
+This is an early version of the server that is compatible with the OpenAI API. It supports only the `/v1/chat/completions` endpoint. To run this server, you need a chat model and a tokenizer with chat support.

How to run?

-1. Download the model and the tokenizer from [here](https://huggingface.co/Azamorn/Meta-Llama-3-8B-Instruct-Distributed).
+1. Download the model and the tokenizer from [here](https://huggingface.co/b4rtaz/Llama-3-8B-Q40-Instruct-Distributed-Llama).
2. Run the server with the following command:
```bash
-./dllama-api --model converter/dllama_original_q40.bin --tokenizer converter/dllama-llama3-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4
+./dllama-api --model converter/dllama_model_lama3_instruct_q40.m --tokenizer converter/dllama_tokenizer_llama3.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4
```

Check the [chat-api-client.js](../../../examples/chat-api-client.js) file to see how to use the API from a Node.js application.
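For a quick test without Node.js, a request sketch in Python; the port and the exact response fields are assumptions following the usual OpenAI chat-completions shape, not something this README specifies:

```python
# Hedged sketch of calling the chat completions endpoint; the port and
# the response fields are assumptions, not taken from this repository.
import json
import urllib.request

req = urllib.request.Request(
    'http://127.0.0.1:9990/v1/chat/completions',
    data=json.dumps({
        'messages': [{'role': 'user', 'content': 'Hello!'}]
    }).encode('utf-8'),
    headers={'Content-Type': 'application/json'})
with urllib.request.urlopen(req) as response:
    body = json.load(response)
print(body['choices'][0]['message']['content'])
```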