Skip to content

Commit

Permalink
支持huggingface tokenizers中的ByteLevel处理
Browse files Browse the repository at this point in the history
  • Loading branch information
cgli authored and TylunasLi committed Feb 12, 2024
1 parent 6498f58 commit ad7bf0d
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 0 deletions.
6 changes: 6 additions & 0 deletions include/fastllm.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <iostream>
#include <functional>
#include <memory>
#include <codecvt>
#include "devices/cpu/cputhreadpool.h"

#ifdef USE_SENTENCEPIECE
Expand Down Expand Up @@ -363,10 +364,15 @@ namespace fastllm {

bool add_dummy_prefix = true; // 是否在首位添加空格
bool remove_extra_whitespaces = true; // 是否将多个空格合并为一个
bool byte_as_char = false; // 是否将byte变为展示字符

std::unordered_map <int, std::string> tokenToStringDict;
std::unordered_map <int, float> tokenToScoreDict;
std::unordered_map <std::string, int> stringToTokenDict;

std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::unordered_map <wchar_t, wchar_t> byteCharDict;
std::unordered_map <wchar_t, wchar_t> charByteDict;
#ifdef USE_SENTENCEPIECE
std::unique_ptr<sentencepiece::SentencePieceProcessor> spProcessor;
#endif
Expand Down
35 changes: 35 additions & 0 deletions src/fastllm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,18 @@ namespace fastllm {

Tokenizer::Tokenizer() {
    root = new TrieNode();
    // Build the bidirectional byte <-> display-character maps used by the
    // HuggingFace tokenizers "ByteLevel" scheme (GPT-2 style): every byte
    // that is not a printable character is represented by the code point
    // 0x100 + n, where n counts the non-printable bytes in ascending order.
    int n = 0;
    wchar_t special_token = L'\x0';
    // Bytes 0x00..0x20 (everything before '!').
    for (; special_token < L'!'; special_token++, n++) {
        byteCharDict[L'\x100' + n] = special_token;
        charByteDict[special_token] = L'\x100' + n;
    }
    // Bytes 0x7F..0xA0 (DEL plus the C1 control range and NBSP).
    for (special_token = L'\x7F'; special_token < L'\xA1'; special_token++, n++) {
        byteCharDict[L'\x100' + n] = special_token;
        charByteDict[special_token] = L'\x100' + n;
    }
    // Byte 0xAD (soft hyphen) is the last remapped byte. Both directions
    // must use the SAME offset; the original code incremented n between the
    // two assignments, making the forward and reverse maps disagree for 0xAD.
    byteCharDict[L'\x100' + n] = L'\xAD';
    charByteDict[L'\xAD'] = L'\x100' + n;
    n++;
}

Tokenizer::~Tokenizer() {
Expand All @@ -799,8 +811,13 @@ namespace fastllm {
q.push_back(it.second);
}
}
for (TrieNode * node : q)
delete node;
q.clear();
root = new TrieNode();
tokenToStringDict.clear();
tokenToScoreDict.clear();
stringToTokenDict.clear();
}

void Tokenizer::Insert(const std::string &s, int tokenId, float score) {
Expand Down Expand Up @@ -851,6 +868,15 @@ namespace fastllm {
}

std::string Tokenizer::Normalize(const std::string &ori) {
if (this->byte_as_char) {
std::wstring ws = converter.from_bytes(ori);
for (int i=0; i < ws.length(); i++) {
if (charByteDict.find(ws[i]) != charByteDict.end()) {
ws[i] = charByteDict[ws[i]];
}
}
return converter.to_bytes(ws);
}
std::string blank = "";
blank += 226, blank += 150, blank += 129;
std::string s = this->add_dummy_prefix ? blank : "";
Expand Down Expand Up @@ -1232,6 +1258,15 @@ namespace fastllm {
ret.replace(pos, blank.length(), " ");
else break;
}
if (this->byte_as_char) {
std::wstring wret = converter.from_bytes(ret);
for (int i=0; i < wret.length(); i++) {
if (byteCharDict.find(wret[i]) != byteCharDict.end()) {
wret[i] = byteCharDict[wret[i]];
}
}
ret = converter.to_bytes(wret);
}
int pos = ret.find("<|blank_");
if (pos != -1) {
int space_num = atoi(ret.substr(8, ret.size() - 10).c_str());
Expand Down
6 changes: 6 additions & 0 deletions src/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ namespace fastllm {
std::istringstream iss(value);
iss >> std::boolalpha >> this->weight.tokenizer.remove_extra_whitespaces;
}
if (this->weight.dicts.find("tokenizer_byte_as_char") != this->weight.dicts.end()) {
std::string value = this->weight.dicts["tokenizer_byte_as_char"];
transform(value.begin(), value.end(), value.begin(), ::tolower);
std::istringstream iss(value);
iss >> std::boolalpha >> this->weight.tokenizer.byte_as_char;
}

this->deviceMap = GetDeviceMap();
}
Expand Down
6 changes: 6 additions & 0 deletions tools/fastllm_pytools/hf_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import ctypes;
import numpy as np
import torch
from transformers import PreTrainedTokenizerFast
from tokenizers.decoders import ByteLevel

fastllm_data_type_dict = {
"int4": 8,
Expand Down Expand Up @@ -74,6 +76,10 @@ def create(model,
modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces
except:
pass
elif isinstance(tokenizer, PreTrainedTokenizerFast):
if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \
and isinstance(tokenizer._tokenizer.decoder, ByteLevel):
modelInfo["tokenizer_byte_as_char"] = True

peft_config = {}
active_adapter = ""
Expand Down
6 changes: 6 additions & 0 deletions tools/fastllm_pytools/torch2flm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import struct
import numpy as np
import torch
from transformers import PreTrainedTokenizerFast
from tokenizers.decoders import ByteLevel

def writeString(fo, s):
fo.write(struct.pack('i', len(s)))
Expand Down Expand Up @@ -131,6 +133,10 @@ def tofile(exportPath,
modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces
except:
pass
elif isinstance(tokenizer, PreTrainedTokenizerFast):
if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \
and isinstance(tokenizer._tokenizer.decoder, ByteLevel):
modelInfo["tokenizer_byte_as_char"] = True

if hasattr(model, "peft_config"):
adapter_size = len(model.peft_config)
Expand Down

0 comments on commit ad7bf0d

Please sign in to comment.