diff --git a/docs/llama_cookbook.md b/docs/llama_cookbook.md index 3110653e..8c4c225e 100644 --- a/docs/llama_cookbook.md +++ b/docs/llama_cookbook.md @@ -8,7 +8,13 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 以下配置方案根据模型的源代码整理,不保证模型推理结果与原版完全一致。 -## 修改脚本并转换 +## 修改方式 + +目前,转换脚本和两行加速方式均可用于llama类模型。但无论采用哪一种方式,都需要预留足够的内存(可以用swap空间)。 + +在float16模式下,转换时约需要4×参数量+1GB的空闲内存。 + +### 转换脚本 这里以支持推理各类Llama结构的基座模型为例,介绍如何应用本文档。 @@ -40,17 +46,36 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 如需添加Token ID而非字符串(类似baichuan-chat模型),可以使用“”的格式添加。 +* 执行脚本 + +```shell +python3 tools/alpaca2flm.py [输出文件名] [精度] [原始模型名称或路径] +``` + ### 两行加速 ```python + conf = model.config.__dict__ + conf["model_type"] = "llama" llm.from_hf(model, tokenizer, pre_prompt = "", user_role = "", bot_role = "", history_sep = "", dtype = dtype) ``` +## 对齐 + +如果想使fastllm模型和原版transformers模型基本一致,最主要的操作是对齐tokenizer。 +如果模型使用了huggingface 加速版本的Tokenizers(即模型目录中包含`tokenizer.json`并优先使用),目前的转换脚本**仅在从本地文件转换时,能够对齐tokenizer**。 + +注意检查原始tokenizer的`encode()`方法返回的结果前面是否会加空格。如果原始tokenizer没有加空格,则需要设置: + +```python + conf["tokenizer_add_dummy_prefix"] = False +``` + ## Base Model -见上方“[修改方案](#修改方案)”。 +见上方“[修改方案](#修改方式)”。 一部分模型需要制定bos_token_id,假设bos_token_id为1则可以配置如下: @@ -66,6 +91,8 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 ### InternLM(书生) +* internlm/[internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) +* internlm/[internlm-chat-7b v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1_1) * internlm/[internlm-chat-20b](https://huggingface.co/internlm/internlm-chat-20b) ```python @@ -76,6 +103,15 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 history_sep = "\n", dtype = dtype) ``` +可以直接使用`internlm2flm.py`脚本转换: + +``` sh +cd build +python3 tools/internlm2flm.py internlm-7b-fp16.flm float16 #导出float16模型 +python3 tools/internlm2flm.py internlm-7b-int8.flm int8 #导出int8模型 +python3 tools/internlm2flm.py internlm-7b-int4.flm int4 #导出int4模型 +python3 tools/internlm2flm.py internlm-7b-int4.flm float16 internlm/internlm-chat-7b #导出internlm-chat-7b float16模型 +``` ### XVERSE @@ -85,10 +121,12 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 ```python conf = model.config.__dict__ conf["model_type"] = "llama" + conf["tokenizer_add_dummy_prefix"] = False torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "", user_role = "Human: ", bot_role = "\n\nAssistant: ", history_sep = "", dtype = dtype) ``` +XVERSE-13B-Chat V1 版本需要对输入做NFKC规范化,fastllm暂不支持,因此需要使用原始tokenizer. ### 其他 llama1 系列 @@ -163,7 +201,7 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 ```python torch2flm.tofile(exportPath, model, tokenizer, pre_prompt="The following is a conversation between a human and an AI assistant namely YuLan, developed by GSAI, Renmin University of China. " \ - "The AI assistant gives helpful, detailed, and polite answers to the user's questions.\n" + "The AI assistant gives helpful, detailed, and polite answers to the user's questions.\n", user_role="[|Human|]:", bot_role="\n[|AI|]:", history_sep="\n", dtype=dtype) ``` @@ -174,7 +212,7 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 ```python torch2flm.tofile(exportPath, model, tokenizer, - pre_prompt="Below is an instruction that describes a task. " - "Write a response that appropriately completes the request.\n\n" + pre_prompt="Below is an instruction that describes a task. 
" \ + "Write a response that appropriately completes the request.\n\n", user_role="### Instruction:\n", bot_role="\n\n### Response:", history_sep="\n", dtype=dtype) ``` diff --git a/include/fastllm.h b/include/fastllm.h index d945dcb5..6edab351 100644 --- a/include/fastllm.h +++ b/include/fastllm.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include "devices/cpu/cputhreadpool.h" #ifdef USE_SENTENCEPIECE @@ -43,7 +45,7 @@ namespace fastllm { float top_p = 1.0; // top_p采样 float temperature = 1.0; // 温度参数,一般在0.1 ~ 1.0之间,设大这个参数可以带来结果的多样性 bool output_logits = false; // 是否返回logits - bool enable_hash_id = false; // 给会话添加hash id + bool enable_hash_id = false; // 给会话添加hash id std::multiset stop_token_ids; bool IsSimpleGreedy() const { @@ -359,11 +361,22 @@ namespace fastllm { TrieNode *root; + TrieNode *specialRoot = nullptr; + TokenizerType type = TokenizerType::BPE; + bool addDummyPrefix = true; // 是否在首位添加空格 + bool removeExtraWhitespaces = true; // 是否将多个空格合并为一个 + bool byteAsChar = false; // 是否将byte变为展示字符 + std::unordered_map tokenToStringDict; std::unordered_map tokenToScoreDict; std::unordered_map stringToTokenDict; + std::vector specialTokens; + + std::wstring_convert> converter; + std::unordered_map byteCharDict; + std::unordered_map charByteDict; #ifdef USE_SENTENCEPIECE std::unique_ptr spProcessor; #endif @@ -380,6 +393,10 @@ namespace fastllm { void Insert(const std::string &s, int tokenId, float score = 1.0f); // 插入一个token + void SetSpecialTokens(const std::map &specialTokens); // 设置需要优先处理的特殊token + + std::string Normalize(const std::string &ori); // 字符规范化 + Data Encode(const std::string &s); // 编码 std::string Decode(const Data &data); // 解码 diff --git a/src/fastllm.cpp b/src/fastllm.cpp index 4572a2f2..be044e55 100644 --- a/src/fastllm.cpp +++ b/src/fastllm.cpp @@ -783,6 +783,18 @@ namespace fastllm { Tokenizer::Tokenizer() { root = new TrieNode(); + int n = 0; + wchar_t special_token = L'\x0'; + for (; special_token < L'!'; special_token++, n++) { + byteCharDict[L'\x100' + n] = special_token; + charByteDict[special_token] = L'\x100' + n; + } + for (special_token = L'\x7F'; special_token < L'\xA1'; special_token++, n++) { + byteCharDict[L'\x100' + n] = special_token; + charByteDict[special_token] = L'\x100' + n; + } + byteCharDict[L'\x100' + n++] = L'\xAD'; + charByteDict[L'\xAD'] = L'\x100' + n++; } Tokenizer::~Tokenizer() { @@ -799,8 +811,23 @@ namespace fastllm { q.push_back(it.second); } } + if (specialRoot != nullptr) { + q.push_back(specialRoot); + for (int i = q.size() - 1; i < q.size(); i++) { + TrieNode *now = q[i]; + for (auto it : now->next) { + q.push_back(it.second); + } + } + } + for (TrieNode * node : q) + delete node; + q.clear(); root = new TrieNode(); + specialRoot = nullptr; tokenToStringDict.clear(); + tokenToScoreDict.clear(); + stringToTokenDict.clear(); } void Tokenizer::Insert(const std::string &s, int tokenId, float score) { @@ -818,6 +845,25 @@ namespace fastllm { stringToTokenDict[s] = tokenId; } + void Tokenizer::SetSpecialTokens(const std::map& specialTokenMap) { + if (specialRoot == nullptr) + specialRoot = new TrieNode(); + for (auto &it : specialTokenMap) { + TrieNode *now = this->specialRoot; + for (int i = 0; i < it.first.size(); i++) { + if (now->next.find(it.first[i]) == now->next.end()) { + now->next[it.first[i]] = new TrieNode(); + } + now = now->next[it.first[i]]; + } + now->tokenId = it.second; + now->score = 0.0f; + tokenToStringDict[it.second] = it.first; + stringToTokenDict[it.first] = it.second; + 
specialTokens.push_back(it.first); + } + } + void Tokenizer::TryMergePairs(std::vector &symbols, int l, int r, std::priority_queue &q) { if (l == -1 || r == -1 || symbols[l].len == 0 || symbols[r].len == 0) { return; @@ -850,24 +896,39 @@ namespace fastllm { return std::numeric_limits::max(); } - Data Tokenizer::Encode(const std::string &ori) { - if (this->type == TokenizerType::BPE) { - std::string blank = ""; - blank += 226, blank += 150, blank += 129; - std::string s = blank; - if (15 < ori.size() && ori.substr(0, 15) == "byteAsChar) { + std::wstring ws(ori.size(), L' '); + for (int i=0; i < ori.length(); i++) { + wchar_t wi = static_cast(static_cast(ori[i])); + if (charByteDict.find(wi) != charByteDict.end()) { + wi = charByteDict[wi]; + } + ws[i] = wi; } - for (int i = 0; i < ori.size(); i++) { - if (ori[i] == ' ') { - // if (i != 0 && ori[i - 1] != ' ') { - // s += blank; - // } + return converter.to_bytes(ws); + } + std::string blank = ""; + blank += 226, blank += 150, blank += 129; + std::string s = this->addDummyPrefix ? blank : ""; + if (15 < ori.size() && ori.substr(0, 15) == "removeExtraWhitespaces && i > 0 && ori[i - 1] == ' ')) { s += blank; - } else { - s += ori[i]; } + } else { + s += ori[i]; } + } + return s; + } + + Data Tokenizer::Encode(const std::string &ori) { + if (this->type == TokenizerType::BPE) { + std::string s = Normalize(ori); std::vector symbols; for (int i = 0; i < s.size(); i++) { @@ -885,6 +946,22 @@ namespace fastllm { } } + if (this->specialRoot != nullptr) { + TrieNode *now = this->specialRoot; + int next = i; + for (; next < s.size(); next++) { + if (now->next.find(s[next]) == now->next.end()) + break; + now = now->next[s[next]]; + } + if (now->tokenId != -999999 && next > i) { + symbols.push_back(Symbol(nullptr, (char *)s.data(), i, 0, (int) symbols.size() - 1, + (int) symbols.size() + 1, now->tokenId)); + i = next - 1; + continue; + } + } + int tokenId = -999999, pos = i - 1; TrieNode *now = this->root; for (int j = i; j < s.size(); j++) { @@ -956,52 +1033,41 @@ namespace fastllm { } } } - return Data (DataType::FLOAT32, {1, (int)v.size()}, v); + return Data(DataType::FLOAT32, {1, (int)v.size()}, v); } else if (this->type == TokenizerType::GLM) { const std::map specialTokens = {{"[MASK]", 50003}, {"[sMASK]", 50008}, {"[gMASK]", 50009}}; - std::string blank = ""; - blank += 226, blank += 150, blank += 129; - std::string s = blank; - for (int i = 0; i < ori.size(); i++) { - if (ori[i] == ' ') { - if (i != 0 && ori[i - 1] != ' ') { - s += blank; - } - } else { - s += ori[i]; - } - } + std::string s = Normalize(ori); std::vector v; - int findPos=0; - while(findPos=0&&(nextSpecialTokenPos<0||ind= 0 && (nextSpecialTokenPos < 0 || ind < nextSpecialTokenPos)) { + nextSpecialTokenPos = ind; + nextSpecialToken = p.second; + nextSpecialTokenLen = p.first.length(); } } std::string subStr; - if(nextSpecialTokenPos<0){ - subStr=s.substr(findPos); - findPos=s.length(); - }else{ - subStr=s.substr(findPos,nextSpecialTokenPos-findPos); - findPos=nextSpecialTokenPos+nextSpecialTokenLen; + if (nextSpecialTokenPos < 0) { + subStr = s.substr(findPos); + findPos = s.length(); + } else { + subStr = s.substr(findPos, nextSpecialTokenPos - findPos); + findPos = nextSpecialTokenPos + nextSpecialTokenLen; } - if(subStr.length()>0){ + if (subStr.length() > 0) { #ifdef USE_SENTENCEPIECE - if(spProcessor!=nullptr){ + if (spProcessor!=nullptr) { std::vector ids; - spProcessor->Encode(subStr,&ids); - for(int id:ids){ + spProcessor->Encode(subStr, &ids); + fo r(int id : ids) { 
v.push_back(id); } - }else{ + } else { #endif std::vector symbols; for (int i = 0; i < subStr.size(); i++) { @@ -1078,7 +1144,7 @@ namespace fastllm { } #endif } - if(nextSpecialTokenPos>=0){ + if (nextSpecialTokenPos >= 0) { v.push_back(nextSpecialToken); } } @@ -1239,6 +1305,17 @@ namespace fastllm { ret.replace(pos, blank.length(), " "); else break; } + if (this->byteAsChar) { + std::wstring wret = converter.from_bytes(ret); + std::string decoded(wret.size(), ' '); + for (int i=0; i < wret.length(); i++) { + if (byteCharDict.find(wret[i]) != byteCharDict.end()) { + wret[i] = byteCharDict[wret[i]]; + } + decoded[i] = static_cast(wret[i]); + } + ret = decoded; + } int pos = ret.find("<|blank_"); if (pos != -1) { int space_num = atoi(ret.substr(8, ret.size() - 10).c_str()); @@ -1313,12 +1390,12 @@ namespace fastllm { } void WeightMap::LoadFromFile(const std::string &fileName) { - #ifdef USE_MMAP +#ifdef USE_MMAP std::shared_ptr mapped_file = std::make_shared(fileName); ModelLoader buffer((char *)mapped_file->data, mapped_file->size); - #else +#else FileBuffer buffer(fileName); - #endif +#endif this->versionId = buffer.ReadInt(); if (this->versionId >= 1) { @@ -1348,7 +1425,8 @@ namespace fastllm { } } - bool useScore = this->dicts["tokenizer_use_score"] == "1"; + bool useScore = this->dicts.find("tokenizer_use_score") != this->dicts.end() + && this->dicts["tokenizer_use_score"] == "1"; int vocabLen = buffer.ReadInt(); for (int i = 0; i < vocabLen; i++) { int len = buffer.ReadInt(); @@ -1360,6 +1438,18 @@ namespace fastllm { float score = useScore ? buffer.ReadFloat() : -i; tokenizer.Insert(x, id, score); } + bool hasSpecialTokens = this->dicts.find("tokenizer_has_special_tokens") != this->dicts.end() + && this->dicts["tokenizer_has_special_tokens"] == "1"; + if (hasSpecialTokens) { + std::map specialTokens; + int specialTokenLen = buffer.ReadInt(); + for (int i = 0; i < specialTokenLen; i++) { + std::string token = buffer.ReadString(); + int id = tokenizer.stringToTokenDict[token]; + specialTokens[token] = id; + } + tokenizer.SetSpecialTokens(specialTokens); + } int len = buffer.ReadInt(); for (int i = 0; i < len; i++) { @@ -1377,10 +1467,10 @@ namespace fastllm { weight[name] = Data(dataType, dims); if (lowMemMode && this->embeddingNames.find(name) != this->embeddingNames.end()) { - if (dataType == DataType::FLOAT32 || dataType == DataType::BFLOAT16 || dataType == DataType::FLOAT16) { - weight[name].fileName = fileName; + if (dataType == DataType::FLOAT32 || dataType == DataType::BFLOAT16 || dataType == DataType::FLOAT16) { + weight[name].fileName = fileName; #if defined(_WIN32) or defined(_WIN64) - weight[name].filePos = _ftelli64(buffer.f); + weight[name].filePos = _ftelli64(buffer.f); #else #ifdef USE_MMAP weight[name].filePos = buffer.tell(); @@ -1391,44 +1481,44 @@ namespace fastllm { #ifdef USE_MMAP buffer.seek(weight[name].GetBytes(), SEEK_CUR); #else - fseek(buffer.f, weight[name].GetBytes(), SEEK_CUR); + fseek(buffer.f, weight[name].GetBytes(), SEEK_CUR); #endif - } else { - ErrorInFastLLM("Error: embedding's type should be float32 or bfloat16.\n"); - } + } else { + ErrorInFastLLM("Error: embedding's type should be float32 or bfloat16.\n"); + } } else { #ifdef USE_MMAP weight[name].SetMapFile(mapped_file); weight[name].expansionBytes = (weight[name].Count(0) * weight[name].unitSize - 1) / weight[name].unitSizeDiv + 1; #else - weight[name].Allocate(); + weight[name].Allocate(); #endif - if (dataType == DataType::FLOAT32 || dataType == DataType::BFLOAT16 || dataType == 
DataType::FLOAT16) { + if (dataType == DataType::FLOAT32 || dataType == DataType::BFLOAT16 || dataType == DataType::FLOAT16) { #ifdef USE_MMAP weight[name].cpuData = buffer.ReadBytes(weight[name].GetBytes()); #else buffer.ReadBytes(weight[name].cpuData, weight[name].GetBytes()); #endif - } else if (dataType == DataType::INT8 || dataType == DataType::INT4) { - int bit = (dataType == DataType::INT4 ? 4 : 8); - weight[name].perChannelAxis = buffer.ReadInt(); - int k = weight[name].perChannelAxis == -1 ? 1 : dims[weight[name].perChannelAxis]; - weight[name].perChannelsConfigs.resize(k); - weight[name].zeros.resize(k); - weight[name].scales.resize(k); - for (int i = 0; i < k; i++) { - float minValue = buffer.ReadFloat(); - float maxValue = buffer.ReadFloat(); - weight[name].perChannelsConfigs[i] = LowBitConfig(minValue, maxValue, bit, 0); - weight[name].zeros[i] = weight[name].perChannelsConfigs[i].zeroPoint; - weight[name].scales[i] = weight[name].perChannelsConfigs[i].scale; - } + } else if (dataType == DataType::INT8 || dataType == DataType::INT4) { + int bit = (dataType == DataType::INT4 ? 4 : 8); + weight[name].perChannelAxis = buffer.ReadInt(); + int k = weight[name].perChannelAxis == -1 ? 1 : dims[weight[name].perChannelAxis]; + weight[name].perChannelsConfigs.resize(k); + weight[name].zeros.resize(k); + weight[name].scales.resize(k); + for (int i = 0; i < k; i++) { + float minValue = buffer.ReadFloat(); + float maxValue = buffer.ReadFloat(); + weight[name].perChannelsConfigs[i] = LowBitConfig(minValue, maxValue, bit, 0); + weight[name].zeros[i] = weight[name].perChannelsConfigs[i].zeroPoint; + weight[name].scales[i] = weight[name].perChannelsConfigs[i].scale; + } #ifdef USE_MMAP weight[name].cpuData = buffer.ReadBytes(weight[name].GetBytes()); #else buffer.ReadBytes(weight[name].cpuData, weight[name].GetBytes()); #endif - } else if (dataType == DataType::INT4_NOZERO) { + } else if (dataType == DataType::INT4_NOZERO) { int bit = 4; weight[name].perChannelAxis = buffer.ReadInt(); int k = weight[name].perChannelAxis == -1 ? 
1 : dims[weight[name].perChannelAxis]; @@ -1442,11 +1532,11 @@ namespace fastllm { weight[name].mins[i] = weight[name].perChannelsConfigs[i].min; weight[name].scales[i] = weight[name].perChannelsConfigs[i].scale; } -#ifdef USE_MMAP + #ifdef USE_MMAP weight[name].cpuData = buffer.ReadBytes(weight[name].GetBytes()); -#else + #else buffer.ReadBytes(weight[name].cpuData, weight[name].GetBytes()); -#endif + #endif } } @@ -1502,7 +1592,8 @@ namespace fastllm { } // 写入词表 - bool useScore = this->dicts["tokenizer_use_score"] == "1"; + bool useScore = this->dicts.find("tokenizer_use_score") != this->dicts.end() + && this->dicts["tokenizer_use_score"] == "1"; buffer.WriteInt((int)tokenizer.tokenToStringDict.size()); for (auto &it : tokenizer.tokenToStringDict) { buffer.WriteInt((int)it.second.size()); @@ -1514,6 +1605,15 @@ namespace fastllm { buffer.WriteFloat(tokenizer.tokenToScoreDict[it.first]); } } + bool hasSpecialTokens = this->dicts.find("tokenizer_has_special_tokens") != this->dicts.end() + && this->dicts["tokenizer_has_special_tokens"] == "1"; + if (hasSpecialTokens) { + int specialTokenLen = tokenizer.specialTokens.size(); + buffer.WriteInt(specialTokenLen); + for (int i = 0; i < specialTokenLen; i++) { + buffer.WriteString(tokenizer.specialTokens[i]); + } + } // 写入权重 int need = 0; diff --git a/src/model.cpp b/src/model.cpp index 401e5ab9..5c919905 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -2,6 +2,7 @@ #include "model.h" #include "fastllm.h" +#include #include "chatglm.h" #include "moss.h" @@ -51,6 +52,24 @@ namespace fastllm { if (this->weight.dicts.find("history_sep") != this->weight.dicts.end()) { history_sep = this->weight.dicts["history_sep"]; } + if (this->weight.dicts.find("tokenizer_add_dummy_prefix") != this->weight.dicts.end()) { + std::string value = this->weight.dicts["tokenizer_add_dummy_prefix"]; + transform(value.begin(), value.end(), value.begin(), ::tolower); + std::istringstream iss(value); + iss >> std::boolalpha >> this->weight.tokenizer.addDummyPrefix; + } + if (this->weight.dicts.find("tokenizer_remove_extra_whitespaces") != this->weight.dicts.end()) { + std::string value = this->weight.dicts["tokenizer_remove_extra_whitespaces"]; + transform(value.begin(), value.end(), value.begin(), ::tolower); + std::istringstream iss(value); + iss >> std::boolalpha >> this->weight.tokenizer.removeExtraWhitespaces; + } + if (this->weight.dicts.find("tokenizer_byte_as_char") != this->weight.dicts.end()) { + std::string value = this->weight.dicts["tokenizer_byte_as_char"]; + transform(value.begin(), value.end(), value.begin(), ::tolower); + std::istringstream iss(value); + iss >> std::boolalpha >> this->weight.tokenizer.byteAsChar; + } this->deviceMap = GetDeviceMap(); } @@ -69,7 +88,7 @@ namespace fastllm { model = (basellm*)(new ChatGLMModel()); } else if (modelType == "moss") { model = (basellm*)(new MOSSModel()); - model->weight.tokenizer.type = Tokenizer::TokenizerType::NORMAL; + model->weight.tokenizer.type = Tokenizer::TokenizerType::BPE; model->eos_token_id = 106068; } else if (modelType == "baichuan") { model = (basellm*)(new LlamaModel()); @@ -79,6 +98,9 @@ namespace fastllm { model->bot_role = "\n:"; model->history_sep = "\n"; model->weight.tokenizer.type = Tokenizer::TokenizerType::BPE; + } else if (modelType == "internlm") { + model = new LlamaModel(); + model->model_type = "internlm"; } else if (modelType == "llama") { model = (basellm*)(new LlamaModel()); } else if (modelType == "qwen") { diff --git a/src/models/llama.cpp b/src/models/llama.cpp index 
bf7f5294..07c4fbed 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -105,10 +105,14 @@ namespace fastllm { RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"], 1e-6, attenInput); std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight"; + std::string qBiasName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.bias"; std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight"; + std::string kBiasName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.bias"; std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight"; + std::string vBiasName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.bias"; std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight"; std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight"; + std::string oBiasName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.bias"; // 1.1 Get q, k, v int bsz = attenInput.dims[0], seqlen = attenInput.dims[1]; @@ -119,9 +123,12 @@ namespace fastllm { Split(qkv, -1, per, per * 2, k); Split(qkv, -1, per * 2, per * 3, v); } else { - Linear(attenInput, weight[qWeightName], Data(), q); - Linear(attenInput, weight[kWeightName], Data(), k); - Linear(attenInput, weight[vWeightName], Data(), v); + Data qBias = (weight.weight.find(qBiasName) != weight.weight.end()) ? weight[qBiasName] : Data(); + Data kBias = (weight.weight.find(kBiasName) != weight.weight.end()) ? weight[kBiasName] : Data(); + Data vBias = (weight.weight.find(vBiasName) != weight.weight.end()) ? weight[vBiasName] : Data(); + Linear(attenInput, weight[qWeightName], qBias, q); + Linear(attenInput, weight[kWeightName], kBias, k); + Linear(attenInput, weight[vWeightName], vBias, v); } std::vector qkvSize = {bsz, seqlen, num_attention_heads, -1}; @@ -198,7 +205,8 @@ namespace fastllm { PermuteSelf(attenOutput, {1, 0, 2}); attenOutput.Reshape({bsz, seqlen, -1}); - Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); + Data oBias = (weight.weight.find(oBiasName) != weight.weight.end()) ? weight[oBiasName] : Data(); + Linear(attenOutput, weight[oWeightName], oBias, attenLastOutput); AddTo(hiddenStates, attenLastOutput); // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput); @@ -267,10 +275,14 @@ namespace fastllm { RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"], 1e-6, attenInput); std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight"; + std::string qBiasName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.bias"; std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight"; + std::string kBiasName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.bias"; std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight"; + std::string vBiasName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.bias"; std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight"; std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight"; + std::string oBiasName = "model.layers." 
+ std::to_string(i) + ".self_attn.o_proj.bias"; // 1.1 Get q, k, v int bsz = attenInput.dims[0], seqlen = attenInput.dims[1]; @@ -281,9 +293,12 @@ namespace fastllm { Split(qkv, -1, per, per * 2, k); Split(qkv, -1, per * 2, per * 3, v); } else { - Linear(attenInput, weight[qWeightName], Data(), q); - Linear(attenInput, weight[kWeightName], Data(), k); - Linear(attenInput, weight[vWeightName], Data(), v); + Data qBias = (weight.weight.find(qBiasName) != weight.weight.end()) ? weight[qBiasName] : Data(); + Data kBias = (weight.weight.find(kBiasName) != weight.weight.end()) ? weight[kBiasName] : Data(); + Data vBias = (weight.weight.find(vBiasName) != weight.weight.end()) ? weight[vBiasName] : Data(); + Linear(attenInput, weight[qWeightName], qBias, q); + Linear(attenInput, weight[kWeightName], kBias, k); + Linear(attenInput, weight[vWeightName], vBias, v); } std::vector qkvSize = {bsz, seqlen, num_attention_heads, -1}; @@ -363,7 +378,8 @@ namespace fastllm { attenOutput.Reshape({seqlen, bsz, -1}); PermuteSelf(attenOutput, {1, 0, 2}); - Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); + Data oBias = (weight.weight.find(oBiasName) != weight.weight.end()) ? weight[oBiasName] : Data(); + Linear(attenOutput, weight[oWeightName], oBias, attenLastOutput); AddTo(hiddenStates, attenLastOutput); // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput); @@ -437,10 +453,14 @@ namespace fastllm { RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"], 1e-6, attenInput); std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight"; + std::string qBiasName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.bias"; std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight"; + std::string kBiasName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.bias"; std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight"; + std::string vBiasName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.bias"; std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight"; std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight"; + std::string oBiasName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.bias"; // 1.1 Get q, k, v int bsz = attenInput.dims[0], seqlen = attenInput.dims[1]; @@ -451,9 +471,12 @@ namespace fastllm { Split(qkv, -1, per, per * 2, k); Split(qkv, -1, per * 2, per * 3, v); } else { - Linear(attenInput, weight[qWeightName], Data(), q); - Linear(attenInput, weight[kWeightName], Data(), k); - Linear(attenInput, weight[vWeightName], Data(), v); + Data qBias = (weight.weight.find(qBiasName) != weight.weight.end()) ? weight[qBiasName] : Data(); + Data kBias = (weight.weight.find(kBiasName) != weight.weight.end()) ? weight[kBiasName] : Data(); + Data vBias = (weight.weight.find(vBiasName) != weight.weight.end()) ? weight[vBiasName] : Data(); + Linear(attenInput, weight[qWeightName], qBias, q); + Linear(attenInput, weight[kWeightName], kBias, k); + Linear(attenInput, weight[vWeightName], vBias, v); } Data attenOutput = Data(DataType::FLOAT32); @@ -556,7 +579,8 @@ namespace fastllm { CatDirect(attenOutput, curAttenOutput, 1); } - Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); + Data oBias = (weight.weight.find(oBiasName) != weight.weight.end()) ? 
weight[oBiasName] : Data(); + Linear(attenOutput, weight[oWeightName], oBias, attenLastOutput); AddTo(hiddenStates, attenLastOutput); // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput); @@ -600,9 +624,9 @@ namespace fastllm { #endif //auto st = std::chrono::system_clock::now(); #ifdef PY_API - size_t pos = input.rfind("time_stamp:"); - std::string prompt = (generationConfig.enable_hash_id && pos != -1)? input.substr(0, pos):input; - size_t hash_id = std::hash{}(input); + size_t pos = input.rfind("time_stamp:"); + std::string prompt = (generationConfig.enable_hash_id && pos != -1)? input.substr(0, pos):input; + size_t hash_id = std::hash{}(input); Data inputIds = this->weight.tokenizer.Encode(prompt); #else Data inputIds = this->weight.tokenizer.Encode(input); diff --git a/src/pybinding.cpp b/src/pybinding.cpp index 5452656f..41f5ff20 100644 --- a/src/pybinding.cpp +++ b/src/pybinding.cpp @@ -260,6 +260,9 @@ PYBIND11_MODULE(pyfastllm, m) { py::class_(m, "Tokenizer") + .def_readonly("add_dummy_prefix", &fastllm::Tokenizer::addDummyPrefix) + .def_readonly("remove_extra_whitespaces", &fastllm::Tokenizer::removeExtraWhitespaces) + .def_readonly("byte_as_char", &fastllm::Tokenizer::byteAsChar) .def("encode", &fastllm::Tokenizer::Encode) // .def("decode", &fastllm::Tokenizer::Decode) .def("decode", &fastllm::Tokenizer::Decode, "Decode from Tensor") @@ -273,7 +276,8 @@ PYBIND11_MODULE(pyfastllm, m) { return py::bytes(ret); }) .def("clear", &fastllm::Tokenizer::Clear) - .def("insert", &fastllm::Tokenizer::Insert); + .def("insert", &fastllm::Tokenizer::Insert) + .def("set_special_tokens", &fastllm::Tokenizer::SetSpecialTokens); py::class_(m, "WeightMap") .def_readonly("tokenizer", &fastllm::WeightMap::tokenizer) diff --git a/tools/fastllm_pytools/hf_model.py b/tools/fastllm_pytools/hf_model.py index 95fddd2d..e50e8cb3 100644 --- a/tools/fastllm_pytools/hf_model.py +++ b/tools/fastllm_pytools/hf_model.py @@ -1,12 +1,16 @@ from fastllm_pytools import llm; -import torch; import ctypes; -import numpy as np; +import builtins, os, json +import numpy as np +import torch +from transformers import PreTrainedTokenizerFast +from tokenizers.decoders import ByteLevel fastllm_data_type_dict = { "int4": 8, "int8": 3, - "float16": 7 + "float16": 7, + "float32": 0, } fastllm_weight_type_dict = { "linear": 1, @@ -49,7 +53,7 @@ def create(model, # Baichuan-13B-chat modelInfo["user_role"] = (" ") if hasattr(model.generation_config, "user_token_id") else ""; modelInfo["bot_role"] = ("") if hasattr(model.generation_config, "assistant_token_id") else ""; - modelInfo["history_sep"] = ""; + modelInfo["history_sep"] = "" if (modelInfo["model_type"] == "qwen"): if modelInfo["chat_format"] == "chatml": modelInfo["im_end_id"] = tokenizer.im_end_id @@ -61,20 +65,31 @@ def create(model, modelInfo["bot_role"] = ("")) + ">"); modelInfo["history_sep"] = ""; - modelInfo["tokenizer_use_score"] = "1" # 分词带分数 - - weight_type_dict = {}; - module_dict = {}; - weight_bits = {}; - for key, m in model.named_modules(): - if (str(type(m)).find("QuantizedLinear") != -1): - weight_type_dict[key + ".weight"] = "QuantizedLinear"; - weight_bits[key + ".weight"] = m.weight_bit_width; - if (isinstance(m, torch.nn.Linear)): - weight_type_dict[key + ".weight"] = "linear"; - module_dict[key + ".weight"] = m; - if (isinstance(m, torch.nn.Embedding)): - weight_type_dict[key] = "embedding"; + if tokenizer: + modelInfo["tokenizer_use_score"] = "1" # 分词带分数 + if 
len(tokenizer.all_special_tokens) > 0: + token_set = set() + for token in [tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token, tokenizer.pad_token]: + for prompt in [pre_prompt, user_role, bot_role, history_sep]: + if prompt and str(token) in prompt: + modelInfo["tokenizer_has_special_tokens"] = "1" + token_set.add(str(token)) + if len(tokenizer.all_special_tokens) > len(token_set): + modelInfo["tokenizer_has_special_tokens"] = "1" + if hasattr(tokenizer, "sp_model") or (hasattr(tokenizer, "tokenizer") and hasattr(tokenizer.tokenizer, "sp_model")): + try: + import sentencepiece.sentencepiece_model_pb2 as model_pb2 + with open(tokenizer.vocab_file, "rb") as f: + sp_model_data = f.read() + sp_model_proto = model_pb2.ModelProto.FromString(sp_model_data) + modelInfo["tokenizer_add_dummy_prefix"] = sp_model_proto.normalizer_spec.add_dummy_prefix + modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces + except: + pass + elif isinstance(tokenizer, PreTrainedTokenizerFast): + if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \ + and isinstance(tokenizer._tokenizer.decoder, ByteLevel): + modelInfo["tokenizer_byte_as_char"] = True peft_config = {} active_adapter = "" @@ -85,7 +100,7 @@ def create(model, active_adapter = model.active_adapter model = model.cpu(); - dict = model.state_dict(); + dict = model.state_dict() if (modelInfo["model_type"] == "baichuan" and modelInfo["vocab_size"] == 125696): # normalize lm_head for Baichuan 2 @@ -94,16 +109,16 @@ def create(model, model.load_state_dict(dict) model_type = modelInfo["model_type"]; - model = llm.fastllm_lib.create_empty_llm_model(model_type.encode()); + model_handle = llm.fastllm_lib.create_empty_llm_model(model_type.encode()); for it in modelInfo.keys(): - llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode()); + llm.fastllm_lib.add_dict_llm_model(model_handle, str(it).encode(), str(modelInfo[it]).encode()); for adapter_name in peft_config.keys(): adapter_dict = peft_config[adapter_name].__dict__ for it in adapter_dict.keys(): - llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode()) + llm.fastllm_lib.add_adapter_dict_llm_model(model_handle, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode()) if len(active_adapter) != 0: - llm.fastllm_lib.set_adapter(model, str(active_adapter).encode()) + llm.fastllm_lib.set_adapter(model_handle, str(active_adapter).encode()) # 1. 
vocab if (tokenizer): @@ -111,62 +126,97 @@ def create(model, if modelInfo["model_type"] == "qwen": pass else: - tokenizer = tokenizer.tokenizer; + tokenizer = tokenizer.tokenizer if (hasattr(tokenizer, "sp_model")): - piece_size = tokenizer.sp_model.piece_size(); + piece_size = tokenizer.sp_model.piece_size() for i in range(piece_size): - llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(), + llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, tokenizer.sp_model.id_to_piece(i).encode(), i, ctypes.c_float(tokenizer.sp_model.get_score(i))); else: - vocab = tokenizer.get_vocab(); + merges = {} + if (modelInfo["model_type"] == "moss"): + merges = {("".join(bpe_tokens), token_index) for bpe_tokens, token_index in sorted(tokenizer.bpe_ranks.items(), key=lambda kv: kv[1])} + elif isinstance(tokenizer, PreTrainedTokenizerFast): + tokenizer_file = tokenizer.name_or_path + tokenizer.vocab_files_names['tokenizer_file'] + if os.path.exists(tokenizer_file): + with open(tokenizer_file, "r", encoding='utf-8') as f: + bpe_merges = json.load(f)["model"]["merges"] + bpe_merges = [pair.replace(" ", "") for pair in bpe_merges] + merges = builtins.dict(zip(bpe_merges, range(0, -len(bpe_merges), -1))) + vocab = tokenizer.get_vocab() for v in vocab.keys(): + score = merges[v] if v in merges else 1.0 if (modelInfo["model_type"] == "moss"): - vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]; - llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0)); + s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v] + llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, s, vocab[v], ctypes.c_float(score)); elif (modelInfo["model_type"] == "qwen"): - llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0)); + llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, v, vocab[v], ctypes.c_float(1.0)); else: - llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0)); - tot = 0; + llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, v.encode(), vocab[v], ctypes.c_float(score)); + if ("tokenizer_has_special_tokens" in modelInfo): + special_tokens_str = ''.join(tokenizer.all_special_tokens) + special_tokens_len = [len(x) for x in tokenizer.all_special_tokens] + special_tokens_ids = tokenizer.all_special_ids + llm.fastllm_lib.set_special_tokens_llm_model(model_handle, len(special_tokens_len), + (ctypes.c_int * len(special_tokens_len))(*special_tokens_len), + special_tokens_str.encode(), + (ctypes.c_int * len(special_tokens_ids))(*special_tokens_ids)); + + weight_type_dict = {} + module_dict = {} + weight_bits = {} + for key, m in model.named_modules(): + if (str(type(m)).find("QuantizedLinear") != -1): + weight_type_dict[key + ".weight"] = "QuantizedLinear"; + weight_bits[key + ".weight"] = m.weight_bit_width; + if (isinstance(m, torch.nn.Linear)): + weight_type_dict[key + ".weight"] = "linear" + module_dict[key + ".weight"] = m + if (isinstance(m, torch.nn.Embedding)): + weight_type_dict[key] = "embedding" + + # 2. 
weight + tot = 0 for key in dict: - ori_data_type = 0; - ori_np_data_type = np.float32; - cur_weight_type = 0; + ori_data_type = 0 + ori_np_data_type = np.float32 + cur_weight_type = 0 if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict): - cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]; - to_data_type = 0; + cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]] + to_data_type = 0 if (cur_weight_type == 1): - to_data_type = fastllm_data_type_dict[dtype]; + to_data_type = fastllm_data_type_dict[dtype] if (to_data_type == 7): - ori_data_type = 7; - ori_np_data_type = np.float16; + ori_data_type = 7 + ori_np_data_type = np.float16 elif (cur_weight_type == 2): # TODO bfloat - to_data_type = 0; + to_data_type = 0 weight_name = key - if peft_config is not None: + if hasattr(model, "peft_config"): weight_name = weight_name.replace('base_model.model.', '') if (cur_weight_type == 111): - llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(), + llm.fastllm_lib.add_qlinear_weight_llm_model(model_handle, weight_name.encode(), len(dict[key].shape), (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)), weight_bits[key], dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p), dict[key].numpy().ctypes.data_as(ctypes.c_void_p)); else: - llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(), + llm.fastllm_lib.add_weight_llm_model(model_handle, weight_name.encode(), len(dict[key].shape), (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)), to_data_type, cur_weight_type, ori_data_type, dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p)); tot += 1; print("convert (", tot, "/", len(dict), end = " )\r"); + dict[key].to(torch.device("meta")) print(""); - llm.fastllm_lib.init_params_llm_model(model); - llm.fastllm_lib.warmup_llm_model(model); - ret = llm.model("", id = model); + llm.fastllm_lib.init_params_llm_model(model_handle); + llm.fastllm_lib.warmup_llm_model(model_handle); + ret = llm.model("", id = model_handle); return ret; diff --git a/tools/fastllm_pytools/llm.py b/tools/fastllm_pytools/llm.py index 426f7aa0..f9091d87 100644 --- a/tools/fastllm_pytools/llm.py +++ b/tools/fastllm_pytools/llm.py @@ -55,6 +55,8 @@ fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_float, ctypes.c_int] +fastllm_lib.set_special_tokens_llm_model.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p] + fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p] def set_cpu_threads(threads: int): @@ -145,7 +147,7 @@ def save(self, path : str): fastllm_lib.save_llm_model(self.model, path.encode()); def eval(self): - pass; + return self; def build_tokenizer_decode_token_cache(self): if self.tokenizer_decode_token_cache is not None: diff --git a/tools/fastllm_pytools/torch2flm.py b/tools/fastllm_pytools/torch2flm.py index 6a038d2c..f323f1b0 100644 --- a/tools/fastllm_pytools/torch2flm.py +++ b/tools/fastllm_pytools/torch2flm.py @@ -1,6 +1,9 @@ import struct +import builtins, os, json import numpy as np import torch +from transformers import PreTrainedTokenizerFast +from tokenizers.decoders import ByteLevel def writeString(fo, s): fo.write(struct.pack('i', len(s))) @@ -18,7 +21,8 @@ def writeKeyValue(fo, key, value): } fastllm_weight_type_dict = { "linear": 1, - "embedding": 2 + "embedding": 2, + "QuantizedLinear": 111 } v = np.random.randint(-127, 127, [10, 20]); @@ 
-73,12 +77,6 @@ def tofile(exportPath, print("dtype should be one of ", list(fastllm_data_type_dict.keys())) exit(0) - dict = model.state_dict() - fo = open(exportPath, "wb") - - # 0. version id - fo.write(struct.pack('i', 2)) - # 0.1 model info modelInfo = model.config.__dict__ if model.generation_config is not None: @@ -87,6 +85,11 @@ def tofile(exportPath, print("unknown model_type.") exit(0) + fo = open(exportPath, "wb") + + # 0. version id + fo.write(struct.pack('i', 2)) + if (pre_prompt is not None): modelInfo["pre_prompt"] = pre_prompt if (user_role is not None): @@ -108,7 +111,7 @@ def tofile(exportPath, modelInfo["user_role"] = (" ") if hasattr(model.generation_config, "user_token_id") else ""; modelInfo["bot_role"] = ("") if hasattr(model.generation_config, "assistant_token_id") else ""; modelInfo["history_sep"] = "" - if modelInfo["model_type"] == "qwen": + if (modelInfo["model_type"] == "qwen"): if modelInfo["chat_format"] == "chatml": modelInfo["im_end_id"] = tokenizer.im_end_id modelInfo["im_start_id"] = tokenizer.im_start_id @@ -119,7 +122,31 @@ def tofile(exportPath, modelInfo["bot_role"] = ("")) + ">"); modelInfo["history_sep"] = ""; - modelInfo["tokenizer_use_score"] = "1" # 分词带分数 + if tokenizer: + modelInfo["tokenizer_use_score"] = "1" # 分词带分数 + if len(tokenizer.all_special_tokens) > 0: + token_set = set() + for token in [tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token, tokenizer.pad_token]: + for prompt in [pre_prompt, user_role, bot_role, history_sep]: + if prompt and str(token) in prompt: + modelInfo["tokenizer_has_special_tokens"] = "1" + token_set.add(str(token)) + if len(tokenizer.all_special_tokens) > len(token_set): + modelInfo["tokenizer_has_special_tokens"] = "1" + if hasattr(tokenizer, "sp_model") or (hasattr(tokenizer, "tokenizer") and hasattr(tokenizer.tokenizer, "sp_model")): + try: + import sentencepiece.sentencepiece_model_pb2 as model_pb2 + with open(tokenizer.vocab_file, "rb") as f: + sp_model_data = f.read() + sp_model_proto = model_pb2.ModelProto.FromString(sp_model_data) + modelInfo["tokenizer_add_dummy_prefix"] = sp_model_proto.normalizer_spec.add_dummy_prefix + modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces + except: + pass + elif isinstance(tokenizer, PreTrainedTokenizerFast): + if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \ + and isinstance(tokenizer._tokenizer.decoder, ByteLevel): + modelInfo["tokenizer_byte_as_char"] = True if hasattr(model, "peft_config"): adapter_size = len(model.peft_config) @@ -137,10 +164,12 @@ def tofile(exportPath, for it in adapter_dict.keys(): writeKeyValue(fo, str(it), str(adapter_dict[it])) + dict = model.state_dict() + # 1. 
vocab if (tokenizer): if (hasattr(tokenizer, "tokenizer")): - if (modelInfo['model_type'] == "qwen"): + if modelInfo["model_type"] == "qwen": pass else: tokenizer = tokenizer.tokenizer @@ -155,20 +184,36 @@ def tofile(exportPath, fo.write(struct.pack('i', i)) fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i)))) else: + merges = {} + if (modelInfo["model_type"] == "moss"): + merges = {("".join(bpe_tokens), token_index) for bpe_tokens, token_index in sorted(tokenizer.bpe_ranks.items(), key=lambda kv: kv[1])} + elif isinstance(tokenizer, PreTrainedTokenizerFast): + tokenizer_file = tokenizer.name_or_path + tokenizer.vocab_files_names['tokenizer_file'] + if os.path.exists(tokenizer_file): + with open(tokenizer_file, "r", encoding='utf-8') as f: + bpe_merges = json.load(f)["model"]["merges"] + bpe_merges = [pair.replace(" ", "") for pair in bpe_merges] + merges = builtins.dict(zip(bpe_merges, range(0, -len(bpe_merges), -1))) vocab = tokenizer.get_vocab() fo.write(struct.pack('i', len(vocab))) for v in vocab.keys(): - if (modelInfo['model_type'] == "qwen"): - s = v - elif (modelInfo["model_type"] == "moss"): + score = merges[v] if v in merges else 1.0 + if (modelInfo["model_type"] == "moss"): s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v] + elif (modelInfo["model_type"] == "qwen"): + s = v else: s = v.encode() fo.write(struct.pack('i', len(s))) for c in s: fo.write(struct.pack('i', c)) fo.write(struct.pack('i', vocab[v])) - fo.write(struct.pack('f', 1.0)) + fo.write(struct.pack('f', score)) + if ("tokenizer_has_special_tokens" in modelInfo): + fo.write(struct.pack('i', len(tokenizer.all_special_tokens))) + for special_token in tokenizer.all_special_tokens: + fo.write(struct.pack('i', len(special_token))) + fo.write(special_token.encode()) else: fo.write(struct.pack('i', 0)) @@ -191,6 +236,7 @@ def tofile(exportPath, if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict): cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]] to_data_type = 0 + if (cur_weight_type == 1): to_data_type = fastllm_data_type_dict[dtype] if (to_data_type == 7): @@ -199,13 +245,11 @@ def tofile(exportPath, cur = dict[key].numpy().astype(ori_np_data_type) + weight_name = key if hasattr(model, "peft_config"): - weight_name = key.replace('base_model.model.', '') - fo.write(struct.pack('i', len(weight_name))) - fo.write(weight_name.encode()) - else: - fo.write(struct.pack('i', len(key))) - fo.write(key.encode()) + weight_name = weight_name.replace('base_model.model.', '') + fo.write(struct.pack('i', len(weight_name))) + fo.write(weight_name.encode()) fo.write(struct.pack('i', len(cur.shape))) for i in cur.shape: fo.write(struct.pack('i', i)) diff --git a/tools/scripts/alpaca2flm.py b/tools/scripts/alpaca2flm.py index c8b473d2..e8103461 100644 --- a/tools/scripts/alpaca2flm.py +++ b/tools/scripts/alpaca2flm.py @@ -1,11 +1,13 @@ import sys -from transformers import LlamaTokenizer, LlamaForCausalLM +import torch +from transformers import AutoTokenizer, LlamaForCausalLM from fastllm_pytools import torch2flm if __name__ == "__main__": model_name = sys.argv[3] if len(sys.argv) >= 4 else 'minlik/chinese-alpaca-33b-merged' - tokenizer = LlamaTokenizer.from_pretrained(model_name) - model = LlamaForCausalLM.from_pretrained(model_name).float() + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + # `torch_dtype=torch.float16` is set by default, if it will not cause an OOM Error, you can load model in float32. 
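    # (Editor's sketch, not part of the original script) if memory allows, a float32 load would look like:
    # model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)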
+    model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
     conf = model.config.__dict__
     conf["model_type"] = "llama"
     dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
diff --git a/tools/scripts/internlm2flm.py b/tools/scripts/internlm2flm.py
new file mode 100755
index 00000000..a725a2a3
--- /dev/null
+++ b/tools/scripts/internlm2flm.py
@@ -0,0 +1,16 @@
+import sys
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from fastllm_pytools import torch2flm
+
+if __name__ == "__main__":
+    modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else "internlm/internlm-chat-7b-v1_1"
+    tokenizer = AutoTokenizer.from_pretrained(modelNameOrPath, trust_remote_code=True);
+    # `torch_dtype=torch.float16` is set by default; if it will not cause an OOM error, you can load the model in float32.
+    model = AutoModelForCausalLM.from_pretrained(modelNameOrPath, trust_remote_code=True, torch_dtype=torch.float16)
+    model = model.eval()
+    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
+    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "internlm-7b-" + dtype + ".flm"
+    torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "",
+                     user_role = "<|User|>:", bot_role = "\n<|Bot|>:",
+                     history_sep = "\n", dtype = dtype)
diff --git a/tools/src/pytools.cpp b/tools/src/pytools.cpp
index 7e4f4b41..795d3112 100644
--- a/tools/src/pytools.cpp
+++ b/tools/src/pytools.cpp
@@ -117,6 +117,21 @@ extern "C" {
         return;
     }
 
+    DLL_EXPORT void set_special_tokens_llm_model(int modelId, int token_cnt, int *lens, char *tokens, int *ids) {
+        std::map <std::string, int> tokenMap;
+        int cur = 0;
+        for (int i = 0; i < token_cnt; i++) {
+            std::string key = "";
+            for (int j = 0; j < lens[i]; j++) {
+                key += tokens[cur++];
+            }
+            tokenMap[key] = ids[i];
+        }
+        auto model = models.GetModel(modelId);
+        model->weight.tokenizer.SetSpecialTokens(tokenMap);
+        return;
+    }
+
     DLL_EXPORT int token_decode(int modelId, int tokenId, int output_buffer_len, char *output_buffer) {
         // Returns 0 normally; when the output buffer is too small, returns the number of bytes needed, including the trailing \0
         if(tokenId == -1) {
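For reference, below is a minimal end-to-end sketch (an editorial illustration, not code from this patch) of how the new tokenizer flags and the `torch2flm.tofile` export path shown above fit together for a llama-family model. The model name, output filename, prompt roles, and the decision to disable the dummy prefix are assumptions taken from the examples earlier in this diff and should be verified against the original tokenizer.

```python
# Hypothetical export sketch; the model name, output path and the
# tokenizer_add_dummy_prefix setting are assumptions, not values mandated by this patch.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from fastllm_pytools import torch2flm

model_name = "internlm/internlm-chat-7b"  # assumed example model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,
                                             torch_dtype=torch.float16).eval()

# torch2flm.tofile reads model.config.__dict__, so extra tokenizer flags can be injected here.
conf = model.config.__dict__
conf["model_type"] = "llama"
# Only set this if the original tokenizer's encode() does NOT prepend a dummy space
# (see the tokenizer-alignment notes in the cookbook section of this diff).
conf["tokenizer_add_dummy_prefix"] = False

torch2flm.tofile("internlm-chat-7b-fp16.flm", model, tokenizer,
                 pre_prompt="", user_role="<|User|>:", bot_role="\n<|Bot|>:",
                 history_sep="\n", dtype="float16")
```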