diff --git a/include/fastllm.h b/include/fastllm.h
index 481cb511..e1405a6e 100644
--- a/include/fastllm.h
+++ b/include/fastllm.h
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <codecvt>
 #include "devices/cpu/cputhreadpool.h"
 
 #ifdef USE_SENTENCEPIECE
@@ -363,10 +364,15 @@ namespace fastllm {
         bool add_dummy_prefix = true;   // whether to prepend a blank at the start
         bool remove_extra_whitespaces = true;   // whether to collapse consecutive spaces into one
+        bool byte_as_char = false;  // whether to map raw bytes to display characters
 
         std::unordered_map<int, std::string> tokenToStringDict;
         std::unordered_map<int, float> tokenToScoreDict;
         std::unordered_map<std::string, int> stringToTokenDict;
+
+        std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+        std::unordered_map<wchar_t, wchar_t> byteCharDict;
+        std::unordered_map<wchar_t, wchar_t> charByteDict;
 #ifdef USE_SENTENCEPIECE
         std::unique_ptr<sentencepiece::SentencePieceProcessor> spProcessor;
 #endif
diff --git a/src/fastllm.cpp b/src/fastllm.cpp
index 078c1a8b..1a854ca4 100644
--- a/src/fastllm.cpp
+++ b/src/fastllm.cpp
@@ -783,6 +783,18 @@ namespace fastllm {
 
     Tokenizer::Tokenizer() {
         root = new TrieNode();
+        int n = 0;
+        wchar_t special_token = L'\x0';
+        for (; special_token < L'!'; special_token++, n++) {
+            byteCharDict[L'\x100' + n] = special_token;
+            charByteDict[special_token] = L'\x100' + n;
+        }
+        for (special_token = L'\x7F'; special_token < L'\xA1'; special_token++, n++) {
+            byteCharDict[L'\x100' + n] = special_token;
+            charByteDict[special_token] = L'\x100' + n;
+        }
+        byteCharDict[L'\x100' + n] = L'\xAD';
+        charByteDict[L'\xAD'] = L'\x100' + n;
     }
 
     Tokenizer::~Tokenizer() {
@@ -799,8 +811,13 @@ namespace fastllm {
                 q.push_back(it.second);
             }
         }
+        for (TrieNode *node : q)
+            delete node;
+        q.clear();
         root = new TrieNode();
         tokenToStringDict.clear();
+        tokenToScoreDict.clear();
+        stringToTokenDict.clear();
     }
 
     void Tokenizer::Insert(const std::string &s, int tokenId, float score) {
@@ -851,6 +868,15 @@ namespace fastllm {
     }
 
     std::string Tokenizer::Normalize(const std::string &ori) {
+        if (this->byte_as_char) {
+            std::wstring ws = converter.from_bytes(ori);
+            for (int i = 0; i < ws.length(); i++) {
+                if (charByteDict.find(ws[i]) != charByteDict.end()) {
+                    ws[i] = charByteDict[ws[i]];
+                }
+            }
+            return converter.to_bytes(ws);
+        }
         std::string blank = "";
         blank += 226, blank += 150, blank += 129;
         std::string s = this->add_dummy_prefix ? blank : "";
@@ -1232,6 +1258,15 @@ namespace fastllm {
                 ret.replace(pos, blank.length(), " ");
             else break;
         }
+        if (this->byte_as_char) {
+            std::wstring wret = converter.from_bytes(ret);
+            for (int i = 0; i < wret.length(); i++) {
+                if (byteCharDict.find(wret[i]) != byteCharDict.end()) {
+                    wret[i] = byteCharDict[wret[i]];
+                }
+            }
+            ret = converter.to_bytes(wret);
+        }
         int pos = ret.find("<|blank_");
         if (pos != -1) {
             int space_num = atoi(ret.substr(8, ret.size() - 10).c_str());
diff --git a/src/model.cpp b/src/model.cpp
index 6ea70934..85c0b9cd 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -64,6 +64,12 @@ namespace fastllm {
             std::istringstream iss(value);
             iss >> std::boolalpha >> this->weight.tokenizer.remove_extra_whitespaces;
         }
+        if (this->weight.dicts.find("tokenizer_byte_as_char") != this->weight.dicts.end()) {
+            std::string value = this->weight.dicts["tokenizer_byte_as_char"];
+            transform(value.begin(), value.end(), value.begin(), ::tolower);
+            std::istringstream iss(value);
+            iss >> std::boolalpha >> this->weight.tokenizer.byte_as_char;
+        }
         this->deviceMap = GetDeviceMap();
     }
 
diff --git a/tools/fastllm_pytools/hf_model.py b/tools/fastllm_pytools/hf_model.py
index 47e512b2..735bdccd 100644
--- a/tools/fastllm_pytools/hf_model.py
+++ b/tools/fastllm_pytools/hf_model.py
@@ -2,6 +2,8 @@
 import ctypes;
 import numpy as np
 import torch
+from transformers import PreTrainedTokenizerFast
+from tokenizers.decoders import ByteLevel
 
 fastllm_data_type_dict = {
     "int4": 8,
@@ -74,6 +76,10 @@ def create(model,
                 modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces
             except:
                 pass
+        elif isinstance(tokenizer, PreTrainedTokenizerFast):
+            if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \
+                    and isinstance(tokenizer._tokenizer.decoder, ByteLevel):
+                modelInfo["tokenizer_byte_as_char"] = True
 
     peft_config = {}
     active_adapter = ""
diff --git a/tools/fastllm_pytools/torch2flm.py b/tools/fastllm_pytools/torch2flm.py
index cf3d1dbb..7ae6e333 100644
--- a/tools/fastllm_pytools/torch2flm.py
+++ b/tools/fastllm_pytools/torch2flm.py
@@ -1,6 +1,8 @@
 import struct
 import numpy as np
 import torch
+from transformers import PreTrainedTokenizerFast
+from tokenizers.decoders import ByteLevel
 
 def writeString(fo, s):
     fo.write(struct.pack('i', len(s)))
@@ -131,6 +133,10 @@ def tofile(exportPath,
                 modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces
             except:
                 pass
+        elif isinstance(tokenizer, PreTrainedTokenizerFast):
+            if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \
+                    and isinstance(tokenizer._tokenizer.decoder, ByteLevel):
+                modelInfo["tokenizer_byte_as_char"] = True
 
     if hasattr(model, "peft_config"):
         adapter_size = len(model.peft_config)