Add serialization and deserialization support for special tokens in the tokenizer
cgli authored and TylunasLi committed Feb 16, 2024
1 parent 5c06d27 commit 7062e85
Showing 5 changed files with 90 additions and 37 deletions.
1 change: 1 addition & 0 deletions include/fastllm.h
@@ -372,6 +372,7 @@ namespace fastllm {
std::unordered_map <int, std::string> tokenToStringDict;
std::unordered_map <int, float> tokenToScoreDict;
std::unordered_map <std::string, int> stringToTokenDict;
+        std::vector <std::string> specialTokens;

std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::unordered_map <wchar_t, wchar_t> byteCharDict;
95 changes: 60 additions & 35 deletions src/fastllm.cpp
@@ -845,10 +845,10 @@ namespace fastllm {
stringToTokenDict[s] = tokenId;
}

-    void Tokenizer::SetSpecialTokens(const std::map<std::string, int>& specialTokens) {
+    void Tokenizer::SetSpecialTokens(const std::map<std::string, int>& specialTokenMap) {
         if (specialRoot == nullptr)
             specialRoot = new TrieNode();
-        for (auto &it : specialTokens) {
+        for (auto &it : specialTokenMap) {
             TrieNode *now = this->specialRoot;
             for (int i = 0; i < it.first.size(); i++) {
                 if (now->next.find(it.first[i]) == now->next.end()) {
@@ -857,8 +857,10 @@
                 now = now->next[it.first[i]];
             }
             now->tokenId = it.second;
-            now->score = 1.0;
+            now->score = 0.0f;
             tokenToStringDict[it.second] = it.first;
+            stringToTokenDict[it.first] = it.second;
+            specialTokens.push_back(it.first);
         }
     }
}
}
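For reference, a minimal Python model of what SetSpecialTokens now does (an illustrative sketch only; the implementation is the C++ above): each special token is walked into a character trie, the terminal node is tagged with the token id, and the token is recorded in both lookup dictionaries plus the new specialTokens list that serialization consumes.

    class TrieNode:
        def __init__(self):
            self.next = {}       # char -> TrieNode
            self.token_id = -1   # -1 means "no token ends here"

    def set_special_tokens(root, token_to_string, string_to_token, special_tokens, token_map):
        for token, token_id in token_map.items():
            node = root
            for ch in token:                      # walk/extend the trie
                node = node.next.setdefault(ch, TrieNode())
            node.token_id = token_id              # mark the terminal node
            token_to_string[token_id] = token     # id -> string
            string_to_token[token] = token_id     # string -> id
            special_tokens.append(token)          # kept for serialization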

@@ -1384,12 +1386,12 @@ namespace fastllm {
}

void WeightMap::LoadFromFile(const std::string &fileName) {
#ifdef USE_MMAP
std::shared_ptr<FileMmap> mapped_file = std::make_shared<FileMmap>(fileName);
ModelLoader buffer((char *)mapped_file->data, mapped_file->size);
#else
FileBuffer buffer(fileName);
#endif
this->versionId = buffer.ReadInt();

if (this->versionId >= 1) {
@@ -1419,7 +1421,8 @@
}
}

-            bool useScore = this->dicts["tokenizer_use_score"] == "1";
+            bool useScore = this->dicts.find("tokenizer_use_score") != this->dicts.end()
+                    && this->dicts["tokenizer_use_score"] == "1";
int vocabLen = buffer.ReadInt();
for (int i = 0; i < vocabLen; i++) {
int len = buffer.ReadInt();
@@ -1431,6 +1434,18 @@
float score = useScore ? buffer.ReadFloat() : -i;
tokenizer.Insert(x, id, score);
}
+            bool hasSpecialTokens = this->dicts.find("tokenizer_has_special_tokens") != this->dicts.end()
+                    && this->dicts["tokenizer_has_special_tokens"] == "1";
+            if (hasSpecialTokens) {
+                std::map <std::string, int> specialTokens;
+                int specialTokenLen = buffer.ReadInt();
+                for (int i = 0; i < specialTokenLen; i++) {
+                    std::string token = buffer.ReadString();
+                    int id = tokenizer.stringToTokenDict[token];
+                    specialTokens[token] = id;
+                }
+                tokenizer.SetSpecialTokens(specialTokens);
+            }

int len = buffer.ReadInt();
for (int i = 0; i < len; i++) {
@@ -1448,10 +1463,10 @@
weight[name] = Data(dataType, dims);

if (lowMemMode && this->embeddingNames.find(name) != this->embeddingNames.end()) {
                    if (dataType == DataType::FLOAT32 || dataType == DataType::BFLOAT16 || dataType == DataType::FLOAT16) {
                        weight[name].fileName = fileName;
#if defined(_WIN32) or defined(_WIN64)
                        weight[name].filePos = _ftelli64(buffer.f);
#else
#ifdef USE_MMAP
weight[name].filePos = buffer.tell();
@@ -1462,44 +1477,44 @@
#ifdef USE_MMAP
buffer.seek(weight[name].GetBytes(), SEEK_CUR);
#else
                        fseek(buffer.f, weight[name].GetBytes(), SEEK_CUR);
#endif
                    } else {
                        ErrorInFastLLM("Error: embedding's type should be float32 or bfloat16.\n");
                    }
} else {
#ifdef USE_MMAP
weight[name].SetMapFile(mapped_file);
weight[name].expansionBytes = (weight[name].Count(0) * weight[name].unitSize - 1) / weight[name].unitSizeDiv + 1;
#else
                    weight[name].Allocate();
#endif
                    if (dataType == DataType::FLOAT32 || dataType == DataType::BFLOAT16 || dataType == DataType::FLOAT16) {
#ifdef USE_MMAP
weight[name].cpuData = buffer.ReadBytes(weight[name].GetBytes());
#else
buffer.ReadBytes(weight[name].cpuData, weight[name].GetBytes());
#endif
                    } else if (dataType == DataType::INT8 || dataType == DataType::INT4) {
                        int bit = (dataType == DataType::INT4 ? 4 : 8);
                        weight[name].perChannelAxis = buffer.ReadInt();
                        int k = weight[name].perChannelAxis == -1 ? 1 : dims[weight[name].perChannelAxis];
                        weight[name].perChannelsConfigs.resize(k);
                        weight[name].zeros.resize(k);
                        weight[name].scales.resize(k);
                        for (int i = 0; i < k; i++) {
                            float minValue = buffer.ReadFloat();
                            float maxValue = buffer.ReadFloat();
                            weight[name].perChannelsConfigs[i] = LowBitConfig(minValue, maxValue, bit, 0);
                            weight[name].zeros[i] = weight[name].perChannelsConfigs[i].zeroPoint;
                            weight[name].scales[i] = weight[name].perChannelsConfigs[i].scale;
                        }
#ifdef USE_MMAP
weight[name].cpuData = buffer.ReadBytes(weight[name].GetBytes());
#else
buffer.ReadBytes(weight[name].cpuData, weight[name].GetBytes());
#endif
                    } else if (dataType == DataType::INT4_NOZERO) {
int bit = 4;
weight[name].perChannelAxis = buffer.ReadInt();
int k = weight[name].perChannelAxis == -1 ? 1 : dims[weight[name].perChannelAxis];
@@ -1513,11 +1528,11 @@
weight[name].mins[i] = weight[name].perChannelsConfigs[i].min;
weight[name].scales[i] = weight[name].perChannelsConfigs[i].scale;
}
#ifdef USE_MMAP
weight[name].cpuData = buffer.ReadBytes(weight[name].GetBytes());
#else
buffer.ReadBytes(weight[name].cpuData, weight[name].GetBytes());
#endif
}
}
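As a cross-check, here is a hedged Python sketch of a reader for just the special-token section consumed by LoadFromFile above; read_int/read_string are assumed helpers matching FileBuffer's length-prefixed encoding (4-byte ints in native byte order, as produced by struct.pack('i', ...) in the exporters below).

    import struct

    def read_int(f):
        return struct.unpack('i', f.read(4))[0]      # 4-byte int

    def read_string(f):
        return f.read(read_int(f)).decode('utf-8')   # length-prefixed UTF-8

    def read_special_tokens(f, string_to_token):
        # Ids are not stored in the file; like LoadFromFile, we recover
        # them from the vocabulary read just before this section.
        special = {}
        for _ in range(read_int(f)):
            token = read_string(f)
            special[token] = string_to_token[token]
        return special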

@@ -1573,7 +1588,8 @@
}

            // Write the vocabulary
-            bool useScore = this->dicts["tokenizer_use_score"] == "1";
+            bool useScore = this->dicts.find("tokenizer_use_score") != this->dicts.end()
+                    && this->dicts["tokenizer_use_score"] == "1";
buffer.WriteInt((int)tokenizer.tokenToStringDict.size());
for (auto &it : tokenizer.tokenToStringDict) {
buffer.WriteInt((int)it.second.size());
@@ -1585,6 +1601,15 @@
buffer.WriteFloat(tokenizer.tokenToScoreDict[it.first]);
}
}
+            bool hasSpecialTokens = this->dicts.find("tokenizer_has_special_tokens") != this->dicts.end()
+                    && this->dicts["tokenizer_has_special_tokens"] == "1";
+            if (hasSpecialTokens) {
+                int specialTokenLen = tokenizer.specialTokens.size();
+                buffer.WriteInt(specialTokenLen);
+                for (int i = 0; i < specialTokenLen; i++) {
+                    buffer.WriteString(tokenizer.specialTokens[i]);
+                }
+            }

            // Write the weights
int need = 0;
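Taken together, the reader and writer pin down the new on-disk section. A hedged reconstruction of the layout, inferred from this commit's code rather than from a format spec:

    Appended after the vocabulary, only when the header dict contains
    tokenizer_has_special_tokens == "1":

        int32  count              number of special tokens
        count times:
            int32  length         string length
            bytes  token[length]  token text (UTF-8)

    Token ids are not serialized: LoadFromFile re-derives them through
    tokenizer.stringToTokenDict, so every special token must already be
    present in the vocabulary written above.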
6 changes: 5 additions & 1 deletion src/pybinding.cpp
@@ -260,6 +260,9 @@ PYBIND11_MODULE(pyfastllm, m) {


py::class_<fastllm::Tokenizer>(m, "Tokenizer")
.def_readonly("add_dummy_prefix", &fastllm::Tokenizer::addDummyPrefix)
.def_readonly("remove_extra_whitespaces", &fastllm::Tokenizer::removeExtraWhitespaces)
.def_readonly("byte_as_char", &fastllm::Tokenizer::byteAsChar)
.def("encode", &fastllm::Tokenizer::Encode)
// .def("decode", &fastllm::Tokenizer::Decode)
.def("decode", &fastllm::Tokenizer::Decode, "Decode from Tensor")
@@ -273,7 +276,8 @@ PYBIND11_MODULE(pyfastllm, m) {
return py::bytes(ret);
})
.def("clear", &fastllm::Tokenizer::Clear)
.def("insert", &fastllm::Tokenizer::Insert);
.def("insert", &fastllm::Tokenizer::Insert)
.def("set_special_tokens", &fastllm::Tokenizer::SetSpecialTokens);

py::class_<fastllm::WeightMap>(m, "WeightMap")
.def_readonly("tokenizer", &fastllm::WeightMap::tokenizer)
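A hedged usage sketch for the new binding, assuming pybind11's automatic conversion from a Python dict to std::map<std::string, int>; the token strings and ids are made-up examples.

    import pyfastllm

    def register_special_tokens(weights: "pyfastllm.WeightMap") -> None:
        tok = weights.tokenizer   # WeightMap.tokenizer is exposed read-only
        tok.set_special_tokens({"<bos>": 1, "<eos>": 2})  # hypothetical ids
        # The new read-only attributes are also visible from Python:
        print(tok.add_dummy_prefix, tok.remove_extra_whitespaces, tok.byte_as_char)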
11 changes: 10 additions & 1 deletion tools/fastllm_pytools/hf_model.py
@@ -67,6 +67,15 @@ def create(model,

if tokenizer:
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
+        if len(tokenizer.all_special_tokens) > 0:
+            token_set = set()
+            for token in [tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token, tokenizer.pad_token]:
+                for prompt in [pre_prompt, user_role, bot_role, history_sep]:
+                    if prompt and str(token) in prompt:
+                        modelInfo["tokenizer_has_special_tokens"] = "1"
+                        token_set.add(str(token))
+            if len(tokenizer.all_special_tokens) > len(token_set):
+                modelInfo["tokenizer_has_special_tokens"] = "1"
if hasattr(tokenizer, "sp_model") or (hasattr(tokenizer, "tokenizer") and hasattr(tokenizer.tokenizer, "sp_model")):
try:
import sentencepiece.sentencepiece_model_pb2 as model_pb2
@@ -144,7 +153,7 @@ def create(model,
llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, v, vocab[v], ctypes.c_float(1.0));
else:
llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, v.encode(), vocab[v], ctypes.c_float(score));
-    if (len(tokenizer.all_special_tokens) > 0):
+    if ("tokenizer_has_special_tokens" in modelInfo):
special_tokens_str = ''.join(tokenizer.all_special_tokens)
special_tokens_len = [len(x) for x in tokenizer.all_special_tokens]
special_tokens_ids = tokenizer.all_special_ids
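The detection logic added above distills to a single predicate: flag the model when one of the four basic special tokens occurs in a chat-template fragment, or when the tokenizer defines special tokens beyond those four. A hedged standalone restatement (prompt fragments passed as a list):

    def needs_special_token_section(tokenizer, prompts):
        basic = [tokenizer.bos_token, tokenizer.eos_token,
                 tokenizer.unk_token, tokenizer.pad_token]
        # Basic tokens that actually appear in some prompt fragment.
        used = {str(t) for t in basic
                if any(p and str(t) in p for p in prompts)}
        # Same two conditions as the exporter code above.
        return bool(used) or len(tokenizer.all_special_tokens) > len(used)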
14 changes: 14 additions & 0 deletions tools/fastllm_pytools/torch2flm.py
@@ -124,6 +124,15 @@ def tofile(exportPath,

if tokenizer:
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
+        if len(tokenizer.all_special_tokens) > 0:
+            token_set = set()
+            for token in [tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token, tokenizer.pad_token]:
+                for prompt in [pre_prompt, user_role, bot_role, history_sep]:
+                    if prompt and str(token) in prompt:
+                        modelInfo["tokenizer_has_special_tokens"] = "1"
+                        token_set.add(str(token))
+            if len(tokenizer.all_special_tokens) > len(token_set):
+                modelInfo["tokenizer_has_special_tokens"] = "1"
if hasattr(tokenizer, "sp_model") or (hasattr(tokenizer, "tokenizer") and hasattr(tokenizer.tokenizer, "sp_model")):
try:
import sentencepiece.sentencepiece_model_pb2 as model_pb2
@@ -200,6 +209,11 @@ def tofile(exportPath,
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', vocab[v]))
fo.write(struct.pack('f', score))
if ("tokenizer_has_special_tokens" in modelInfo):
fo.write(struct.pack('i', len(tokenizer.all_special_tokens)))
for special_token in tokenizer.all_special_tokens:
fo.write(struct.pack('i', len(special_token)))
fo.write(special_token.encode())
else:
fo.write(struct.pack('i', 0))

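Finally, a hedged example of driving the exporter so the new section is emitted; the keyword names follow the parameters referenced in the hunk above (pre_prompt, user_role, bot_role, history_sep), while the model/tokenizer objects and prompt strings are placeholders.

    from fastllm_pytools import torch2flm

    def export_flm(model, tokenizer, path="model.flm"):
        # If "</s>" (or any other special token) occurs in these fragments,
        # tokenizer_has_special_tokens is set and the special-token section
        # is written right after the vocabulary.
        torch2flm.tofile(path, model, tokenizer,
                         pre_prompt="You are a helpful assistant.",
                         user_role="<|user|>", bot_role="<|assistant|>",
                         history_sep="</s>")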
