diff --git a/docs/llama_cookbook.md b/docs/llama_cookbook.md index 3110653e..8c4c225e 100644 --- a/docs/llama_cookbook.md +++ b/docs/llama_cookbook.md @@ -8,7 +8,13 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 以下配置方案根据模型的源代码整理,不保证模型推理结果与原版完全一致。 -## 修改脚本并转换 +## 修改方式 + +目前,转换脚本和两行加速方式均可用于llama类模型。但无论采用哪一种方式,都需要预留足够的内存(可以用swap空间)。 + +在float16模式下,转换时约需要4×参数量+1GB的空闲内存。 + +### 转换脚本 这里以支持推理各类Llama结构的基座模型为例,介绍如何应用本文档。 @@ -40,17 +46,36 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 如需添加Token ID而非字符串(类似baichuan-chat模型),可以使用“”的格式添加。 +* 执行脚本 + +```shell +python3 tools/alpaca2flm.py [输出文件名] [精度] [原始模型名称或路径] +``` + ### 两行加速 ```python + conf = model.config.__dict__ + conf["model_type"] = "llama" llm.from_hf(model, tokenizer, pre_prompt = "", user_role = "", bot_role = "", history_sep = "", dtype = dtype) ``` +## 对齐 + +如果想使fastllm模型和原版transformers模型基本一致,最主要的操作是对齐tokenizer。 +如果模型使用了huggingface 加速版本的Tokenizers(即模型目录中包含`tokenizer.json`并优先使用),目前的转换脚本**仅在从本地文件转换时,能够对齐tokenizer**。 + +注意检查原始tokenizer的`encode()`方法返回的结果前面是否会加空格。如果原始tokenizer没有加空格,则需要设置: + +```python + conf["tokenizer_add_dummy_prefix"] = False +``` + ## Base Model -见上方“[修改方案](#修改方案)”。 +见上方“[修改方案](#修改方式)”。 一部分模型需要制定bos_token_id,假设bos_token_id为1则可以配置如下: @@ -66,6 +91,8 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 ### InternLM(书生) +* internlm/[internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) +* internlm/[internlm-chat-7b v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1_1) * internlm/[internlm-chat-20b](https://huggingface.co/internlm/internlm-chat-20b) ```python @@ -76,6 +103,15 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 history_sep = "\n", dtype = dtype) ``` +可以直接使用`internlm2flm.py`脚本转换: + +``` sh +cd build +python3 tools/internlm2flm.py internlm-7b-fp16.flm float16 #导出float16模型 +python3 tools/internlm2flm.py internlm-7b-int8.flm int8 #导出int8模型 +python3 tools/internlm2flm.py internlm-7b-int4.flm int4 #导出int4模型 +python3 tools/internlm2flm.py internlm-7b-int4.flm float16 internlm/internlm-chat-7b #导出internlm-chat-7b float16模型 +``` ### XVERSE @@ -85,10 +121,12 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 ```python conf = model.config.__dict__ conf["model_type"] = "llama" + conf["tokenizer_add_dummy_prefix"] = False torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "", user_role = "Human: ", bot_role = "\n\nAssistant: ", history_sep = "", dtype = dtype) ``` +XVERSE-13B-Chat V1 版本需要对输入做NFKC规范化,fastllm暂不支持,因此需要使用原始tokenizer. ### 其他 llama1 系列 @@ -163,7 +201,7 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 ```python torch2flm.tofile(exportPath, model, tokenizer, pre_prompt="The following is a conversation between a human and an AI assistant namely YuLan, developed by GSAI, Renmin University of China. " \ - "The AI assistant gives helpful, detailed, and polite answers to the user's questions.\n" + "The AI assistant gives helpful, detailed, and polite answers to the user's questions.\n", user_role="[|Human|]:", bot_role="\n[|AI|]:", history_sep="\n", dtype=dtype) ``` @@ -174,7 +212,7 @@ LLaMA类模型有着基本相同的结构,但权重和prompt构造有差异。 ```python torch2flm.tofile(exportPath, model, tokenizer, - pre_prompt="Below is an instruction that describes a task. " - "Write a response that appropriately completes the request.\n\n" + pre_prompt="Below is an instruction that describes a task. 
" \ + "Write a response that appropriately completes the request.\n\n", user_role="### Instruction:\n", bot_role="\n\n### Response:", history_sep="\n", dtype=dtype) ``` diff --git a/include/fastllm.h b/include/fastllm.h index d945dcb5..6edab351 100644 --- a/include/fastllm.h +++ b/include/fastllm.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include "devices/cpu/cputhreadpool.h" #ifdef USE_SENTENCEPIECE @@ -43,7 +45,7 @@ namespace fastllm { float top_p = 1.0; // top_p采样 float temperature = 1.0; // 温度参数,一般在0.1 ~ 1.0之间,设大这个参数可以带来结果的多样性 bool output_logits = false; // 是否返回logits - bool enable_hash_id = false; // 给会话添加hash id + bool enable_hash_id = false; // 给会话添加hash id std::multiset stop_token_ids; bool IsSimpleGreedy() const { @@ -359,11 +361,22 @@ namespace fastllm { TrieNode *root; + TrieNode *specialRoot = nullptr; + TokenizerType type = TokenizerType::BPE; + bool addDummyPrefix = true; // 是否在首位添加空格 + bool removeExtraWhitespaces = true; // 是否将多个空格合并为一个 + bool byteAsChar = false; // 是否将byte变为展示字符 + std::unordered_map tokenToStringDict; std::unordered_map tokenToScoreDict; std::unordered_map stringToTokenDict; + std::vector specialTokens; + + std::wstring_convert> converter; + std::unordered_map byteCharDict; + std::unordered_map charByteDict; #ifdef USE_SENTENCEPIECE std::unique_ptr spProcessor; #endif @@ -380,6 +393,10 @@ namespace fastllm { void Insert(const std::string &s, int tokenId, float score = 1.0f); // 插入一个token + void SetSpecialTokens(const std::map &specialTokens); // 设置需要优先处理的特殊token + + std::string Normalize(const std::string &ori); // 字符规范化 + Data Encode(const std::string &s); // 编码 std::string Decode(const Data &data); // 解码 diff --git a/src/fastllm.cpp b/src/fastllm.cpp index 4572a2f2..be044e55 100644 --- a/src/fastllm.cpp +++ b/src/fastllm.cpp @@ -783,6 +783,18 @@ namespace fastllm { Tokenizer::Tokenizer() { root = new TrieNode(); + int n = 0; + wchar_t special_token = L'\x0'; + for (; special_token < L'!'; special_token++, n++) { + byteCharDict[L'\x100' + n] = special_token; + charByteDict[special_token] = L'\x100' + n; + } + for (special_token = L'\x7F'; special_token < L'\xA1'; special_token++, n++) { + byteCharDict[L'\x100' + n] = special_token; + charByteDict[special_token] = L'\x100' + n; + } + byteCharDict[L'\x100' + n++] = L'\xAD'; + charByteDict[L'\xAD'] = L'\x100' + n++; } Tokenizer::~Tokenizer() { @@ -799,8 +811,23 @@ namespace fastllm { q.push_back(it.second); } } + if (specialRoot != nullptr) { + q.push_back(specialRoot); + for (int i = q.size() - 1; i < q.size(); i++) { + TrieNode *now = q[i]; + for (auto it : now->next) { + q.push_back(it.second); + } + } + } + for (TrieNode * node : q) + delete node; + q.clear(); root = new TrieNode(); + specialRoot = nullptr; tokenToStringDict.clear(); + tokenToScoreDict.clear(); + stringToTokenDict.clear(); } void Tokenizer::Insert(const std::string &s, int tokenId, float score) { @@ -818,6 +845,25 @@ namespace fastllm { stringToTokenDict[s] = tokenId; } + void Tokenizer::SetSpecialTokens(const std::map& specialTokenMap) { + if (specialRoot == nullptr) + specialRoot = new TrieNode(); + for (auto &it : specialTokenMap) { + TrieNode *now = this->specialRoot; + for (int i = 0; i < it.first.size(); i++) { + if (now->next.find(it.first[i]) == now->next.end()) { + now->next[it.first[i]] = new TrieNode(); + } + now = now->next[it.first[i]]; + } + now->tokenId = it.second; + now->score = 0.0f; + tokenToStringDict[it.second] = it.first; + stringToTokenDict[it.first] = it.second; + 
specialTokens.push_back(it.first); + } + } + void Tokenizer::TryMergePairs(std::vector &symbols, int l, int r, std::priority_queue &q) { if (l == -1 || r == -1 || symbols[l].len == 0 || symbols[r].len == 0) { return; @@ -850,24 +896,39 @@ namespace fastllm { return std::numeric_limits::max(); } - Data Tokenizer::Encode(const std::string &ori) { - if (this->type == TokenizerType::BPE) { - std::string blank = ""; - blank += 226, blank += 150, blank += 129; - std::string s = blank; - if (15 < ori.size() && ori.substr(0, 15) == "byteAsChar) { + std::wstring ws(ori.size(), L' '); + for (int i=0; i < ori.length(); i++) { + wchar_t wi = static_cast(static_cast(ori[i])); + if (charByteDict.find(wi) != charByteDict.end()) { + wi = charByteDict[wi]; + } + ws[i] = wi; } - for (int i = 0; i < ori.size(); i++) { - if (ori[i] == ' ') { - // if (i != 0 && ori[i - 1] != ' ') { - // s += blank; - // } + return converter.to_bytes(ws); + } + std::string blank = ""; + blank += 226, blank += 150, blank += 129; + std::string s = this->addDummyPrefix ? blank : ""; + if (15 < ori.size() && ori.substr(0, 15) == "removeExtraWhitespaces && i > 0 && ori[i - 1] == ' ')) { s += blank; - } else { - s += ori[i]; } + } else { + s += ori[i]; } + } + return s; + } + + Data Tokenizer::Encode(const std::string &ori) { + if (this->type == TokenizerType::BPE) { + std::string s = Normalize(ori); std::vector symbols; for (int i = 0; i < s.size(); i++) { @@ -885,6 +946,22 @@ namespace fastllm { } } + if (this->specialRoot != nullptr) { + TrieNode *now = this->specialRoot; + int next = i; + for (; next < s.size(); next++) { + if (now->next.find(s[next]) == now->next.end()) + break; + now = now->next[s[next]]; + } + if (now->tokenId != -999999 && next > i) { + symbols.push_back(Symbol(nullptr, (char *)s.data(), i, 0, (int) symbols.size() - 1, + (int) symbols.size() + 1, now->tokenId)); + i = next - 1; + continue; + } + } + int tokenId = -999999, pos = i - 1; TrieNode *now = this->root; for (int j = i; j < s.size(); j++) { @@ -956,52 +1033,41 @@ namespace fastllm { } } } - return Data (DataType::FLOAT32, {1, (int)v.size()}, v); + return Data(DataType::FLOAT32, {1, (int)v.size()}, v); } else if (this->type == TokenizerType::GLM) { const std::map specialTokens = {{"[MASK]", 50003}, {"[sMASK]", 50008}, {"[gMASK]", 50009}}; - std::string blank = ""; - blank += 226, blank += 150, blank += 129; - std::string s = blank; - for (int i = 0; i < ori.size(); i++) { - if (ori[i] == ' ') { - if (i != 0 && ori[i - 1] != ' ') { - s += blank; - } - } else { - s += ori[i]; - } - } + std::string s = Normalize(ori); std::vector v; - int findPos=0; - while(findPos=0&&(nextSpecialTokenPos<0||ind= 0 && (nextSpecialTokenPos < 0 || ind < nextSpecialTokenPos)) { + nextSpecialTokenPos = ind; + nextSpecialToken = p.second; + nextSpecialTokenLen = p.first.length(); } } std::string subStr; - if(nextSpecialTokenPos<0){ - subStr=s.substr(findPos); - findPos=s.length(); - }else{ - subStr=s.substr(findPos,nextSpecialTokenPos-findPos); - findPos=nextSpecialTokenPos+nextSpecialTokenLen; + if (nextSpecialTokenPos < 0) { + subStr = s.substr(findPos); + findPos = s.length(); + } else { + subStr = s.substr(findPos, nextSpecialTokenPos - findPos); + findPos = nextSpecialTokenPos + nextSpecialTokenLen; } - if(subStr.length()>0){ + if (subStr.length() > 0) { #ifdef USE_SENTENCEPIECE - if(spProcessor!=nullptr){ + if (spProcessor!=nullptr) { std::vector ids; - spProcessor->Encode(subStr,&ids); - for(int id:ids){ + spProcessor->Encode(subStr, &ids); + fo r(int id : ids) { 
v.push_back(id); } - }else{ + } else { #endif std::vector symbols; for (int i = 0; i < subStr.size(); i++) { @@ -1078,7 +1144,7 @@ namespace fastllm { } #endif } - if(nextSpecialTokenPos>=0){ + if (nextSpecialTokenPos >= 0) { v.push_back(nextSpecialToken); } } @@ -1239,6 +1305,17 @@ namespace fastllm { ret.replace(pos, blank.length(), " "); else break; } + if (this->byteAsChar) { + std::wstring wret = converter.from_bytes(ret); + std::string decoded(wret.size(), ' '); + for (int i=0; i < wret.length(); i++) { + if (byteCharDict.find(wret[i]) != byteCharDict.end()) { + wret[i] = byteCharDict[wret[i]]; + } + decoded[i] = static_cast(wret[i]); + } + ret = decoded; + } int pos = ret.find("<|blank_"); if (pos != -1) { int space_num = atoi(ret.substr(8, ret.size() - 10).c_str()); @@ -1313,12 +1390,12 @@ namespace fastllm { } void WeightMap::LoadFromFile(const std::string &fileName) { - #ifdef USE_MMAP +#ifdef USE_MMAP std::shared_ptr mapped_file = std::make_shared(fileName); ModelLoader buffer((char *)mapped_file->data, mapped_file->size); - #else +#else FileBuffer buffer(fileName); - #endif +#endif this->versionId = buffer.ReadInt(); if (this->versionId >= 1) { @@ -1348,7 +1425,8 @@ namespace fastllm { } } - bool useScore = this->dicts["tokenizer_use_score"] == "1"; + bool useScore = this->dicts.find("tokenizer_use_score") != this->dicts.end() + && this->dicts["tokenizer_use_score"] == "1"; int vocabLen = buffer.ReadInt(); for (int i = 0; i < vocabLen; i++) { int len = buffer.ReadInt(); @@ -1360,6 +1438,18 @@ namespace fastllm { float score = useScore ? buffer.ReadFloat() : -i; tokenizer.Insert(x, id, score); } + bool hasSpecialTokens = this->dicts.find("tokenizer_has_special_tokens") != this->dicts.end() + && this->dicts["tokenizer_has_special_tokens"] == "1"; + if (hasSpecialTokens) { + std::map specialTokens; + int specialTokenLen = buffer.ReadInt(); + for (int i = 0; i < specialTokenLen; i++) { + std::string token = buffer.ReadString(); + int id = tokenizer.stringToTokenDict[token]; + specialTokens[token] = id; + } + tokenizer.SetSpecialTokens(specialTokens); + } int len = buffer.ReadInt(); for (int i = 0; i < len; i++) { @@ -1377,10 +1467,10 @@ namespace fastllm { weight[name] = Data(dataType, dims); if (lowMemMode && this->embeddingNames.find(name) != this->embeddingNames.end()) { - if (dataType == DataType::FLOAT32 || dataType == DataType::BFLOAT16 || dataType == DataType::FLOAT16) { - weight[name].fileName = fileName; + if (dataType == DataType::FLOAT32 || dataType == DataType::BFLOAT16 || dataType == DataType::FLOAT16) { + weight[name].fileName = fileName; #if defined(_WIN32) or defined(_WIN64) - weight[name].filePos = _ftelli64(buffer.f); + weight[name].filePos = _ftelli64(buffer.f); #else #ifdef USE_MMAP weight[name].filePos = buffer.tell(); @@ -1391,44 +1481,44 @@ namespace fastllm { #ifdef USE_MMAP buffer.seek(weight[name].GetBytes(), SEEK_CUR); #else - fseek(buffer.f, weight[name].GetBytes(), SEEK_CUR); + fseek(buffer.f, weight[name].GetBytes(), SEEK_CUR); #endif - } else { - ErrorInFastLLM("Error: embedding's type should be float32 or bfloat16.\n"); - } + } else { + ErrorInFastLLM("Error: embedding's type should be float32 or bfloat16.\n"); + } } else { #ifdef USE_MMAP weight[name].SetMapFile(mapped_file); weight[name].expansionBytes = (weight[name].Count(0) * weight[name].unitSize - 1) / weight[name].unitSizeDiv + 1; #else - weight[name].Allocate(); + weight[name].Allocate(); #endif - if (dataType == DataType::FLOAT32 || dataType == DataType::BFLOAT16 || dataType == 
DataType::FLOAT16) { + if (dataType == DataType::FLOAT32 || dataType == DataType::BFLOAT16 || dataType == DataType::FLOAT16) { #ifdef USE_MMAP weight[name].cpuData = buffer.ReadBytes(weight[name].GetBytes()); #else buffer.ReadBytes(weight[name].cpuData, weight[name].GetBytes()); #endif - } else if (dataType == DataType::INT8 || dataType == DataType::INT4) { - int bit = (dataType == DataType::INT4 ? 4 : 8); - weight[name].perChannelAxis = buffer.ReadInt(); - int k = weight[name].perChannelAxis == -1 ? 1 : dims[weight[name].perChannelAxis]; - weight[name].perChannelsConfigs.resize(k); - weight[name].zeros.resize(k); - weight[name].scales.resize(k); - for (int i = 0; i < k; i++) { - float minValue = buffer.ReadFloat(); - float maxValue = buffer.ReadFloat(); - weight[name].perChannelsConfigs[i] = LowBitConfig(minValue, maxValue, bit, 0); - weight[name].zeros[i] = weight[name].perChannelsConfigs[i].zeroPoint; - weight[name].scales[i] = weight[name].perChannelsConfigs[i].scale; - } + } else if (dataType == DataType::INT8 || dataType == DataType::INT4) { + int bit = (dataType == DataType::INT4 ? 4 : 8); + weight[name].perChannelAxis = buffer.ReadInt(); + int k = weight[name].perChannelAxis == -1 ? 1 : dims[weight[name].perChannelAxis]; + weight[name].perChannelsConfigs.resize(k); + weight[name].zeros.resize(k); + weight[name].scales.resize(k); + for (int i = 0; i < k; i++) { + float minValue = buffer.ReadFloat(); + float maxValue = buffer.ReadFloat(); + weight[name].perChannelsConfigs[i] = LowBitConfig(minValue, maxValue, bit, 0); + weight[name].zeros[i] = weight[name].perChannelsConfigs[i].zeroPoint; + weight[name].scales[i] = weight[name].perChannelsConfigs[i].scale; + } #ifdef USE_MMAP weight[name].cpuData = buffer.ReadBytes(weight[name].GetBytes()); #else buffer.ReadBytes(weight[name].cpuData, weight[name].GetBytes()); #endif - } else if (dataType == DataType::INT4_NOZERO) { + } else if (dataType == DataType::INT4_NOZERO) { int bit = 4; weight[name].perChannelAxis = buffer.ReadInt(); int k = weight[name].perChannelAxis == -1 ? 
1 : dims[weight[name].perChannelAxis]; @@ -1442,11 +1532,11 @@ namespace fastllm { weight[name].mins[i] = weight[name].perChannelsConfigs[i].min; weight[name].scales[i] = weight[name].perChannelsConfigs[i].scale; } -#ifdef USE_MMAP + #ifdef USE_MMAP weight[name].cpuData = buffer.ReadBytes(weight[name].GetBytes()); -#else + #else buffer.ReadBytes(weight[name].cpuData, weight[name].GetBytes()); -#endif + #endif } } @@ -1502,7 +1592,8 @@ namespace fastllm { } // 写入词表 - bool useScore = this->dicts["tokenizer_use_score"] == "1"; + bool useScore = this->dicts.find("tokenizer_use_score") != this->dicts.end() + && this->dicts["tokenizer_use_score"] == "1"; buffer.WriteInt((int)tokenizer.tokenToStringDict.size()); for (auto &it : tokenizer.tokenToStringDict) { buffer.WriteInt((int)it.second.size()); @@ -1514,6 +1605,15 @@ namespace fastllm { buffer.WriteFloat(tokenizer.tokenToScoreDict[it.first]); } } + bool hasSpecialTokens = this->dicts.find("tokenizer_has_special_tokens") != this->dicts.end() + && this->dicts["tokenizer_has_special_tokens"] == "1"; + if (hasSpecialTokens) { + int specialTokenLen = tokenizer.specialTokens.size(); + buffer.WriteInt(specialTokenLen); + for (int i = 0; i < specialTokenLen; i++) { + buffer.WriteString(tokenizer.specialTokens[i]); + } + } // 写入权重 int need = 0; diff --git a/src/model.cpp b/src/model.cpp index 401e5ab9..5c919905 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -2,6 +2,7 @@ #include "model.h" #include "fastllm.h" +#include #include "chatglm.h" #include "moss.h" @@ -51,6 +52,24 @@ namespace fastllm { if (this->weight.dicts.find("history_sep") != this->weight.dicts.end()) { history_sep = this->weight.dicts["history_sep"]; } + if (this->weight.dicts.find("tokenizer_add_dummy_prefix") != this->weight.dicts.end()) { + std::string value = this->weight.dicts["tokenizer_add_dummy_prefix"]; + transform(value.begin(), value.end(), value.begin(), ::tolower); + std::istringstream iss(value); + iss >> std::boolalpha >> this->weight.tokenizer.addDummyPrefix; + } + if (this->weight.dicts.find("tokenizer_remove_extra_whitespaces") != this->weight.dicts.end()) { + std::string value = this->weight.dicts["tokenizer_remove_extra_whitespaces"]; + transform(value.begin(), value.end(), value.begin(), ::tolower); + std::istringstream iss(value); + iss >> std::boolalpha >> this->weight.tokenizer.removeExtraWhitespaces; + } + if (this->weight.dicts.find("tokenizer_byte_as_char") != this->weight.dicts.end()) { + std::string value = this->weight.dicts["tokenizer_byte_as_char"]; + transform(value.begin(), value.end(), value.begin(), ::tolower); + std::istringstream iss(value); + iss >> std::boolalpha >> this->weight.tokenizer.byteAsChar; + } this->deviceMap = GetDeviceMap(); } @@ -69,7 +88,7 @@ namespace fastllm { model = (basellm*)(new ChatGLMModel()); } else if (modelType == "moss") { model = (basellm*)(new MOSSModel()); - model->weight.tokenizer.type = Tokenizer::TokenizerType::NORMAL; + model->weight.tokenizer.type = Tokenizer::TokenizerType::BPE; model->eos_token_id = 106068; } else if (modelType == "baichuan") { model = (basellm*)(new LlamaModel()); @@ -79,6 +98,9 @@ namespace fastllm { model->bot_role = "\n:"; model->history_sep = "\n"; model->weight.tokenizer.type = Tokenizer::TokenizerType::BPE; + } else if (modelType == "internlm") { + model = new LlamaModel(); + model->model_type = "internlm"; } else if (modelType == "llama") { model = (basellm*)(new LlamaModel()); } else if (modelType == "qwen") { diff --git a/src/models/llama.cpp b/src/models/llama.cpp index 
bf7f5294..07c4fbed 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -105,10 +105,14 @@ namespace fastllm { RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"], 1e-6, attenInput); std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight"; + std::string qBiasName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.bias"; std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight"; + std::string kBiasName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.bias"; std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight"; + std::string vBiasName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.bias"; std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight"; std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight"; + std::string oBiasName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.bias"; // 1.1 Get q, k, v int bsz = attenInput.dims[0], seqlen = attenInput.dims[1]; @@ -119,9 +123,12 @@ namespace fastllm { Split(qkv, -1, per, per * 2, k); Split(qkv, -1, per * 2, per * 3, v); } else { - Linear(attenInput, weight[qWeightName], Data(), q); - Linear(attenInput, weight[kWeightName], Data(), k); - Linear(attenInput, weight[vWeightName], Data(), v); + Data qBias = (weight.weight.find(qBiasName) != weight.weight.end()) ? weight[qBiasName] : Data(); + Data kBias = (weight.weight.find(kBiasName) != weight.weight.end()) ? weight[kBiasName] : Data(); + Data vBias = (weight.weight.find(vBiasName) != weight.weight.end()) ? weight[vBiasName] : Data(); + Linear(attenInput, weight[qWeightName], qBias, q); + Linear(attenInput, weight[kWeightName], kBias, k); + Linear(attenInput, weight[vWeightName], vBias, v); } std::vector qkvSize = {bsz, seqlen, num_attention_heads, -1}; @@ -198,7 +205,8 @@ namespace fastllm { PermuteSelf(attenOutput, {1, 0, 2}); attenOutput.Reshape({bsz, seqlen, -1}); - Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); + Data oBias = (weight.weight.find(oBiasName) != weight.weight.end()) ? weight[oBiasName] : Data(); + Linear(attenOutput, weight[oWeightName], oBias, attenLastOutput); AddTo(hiddenStates, attenLastOutput); // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput); @@ -267,10 +275,14 @@ namespace fastllm { RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"], 1e-6, attenInput); std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight"; + std::string qBiasName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.bias"; std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight"; + std::string kBiasName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.bias"; std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight"; + std::string vBiasName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.bias"; std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight"; std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight"; + std::string oBiasName = "model.layers." 
+ std::to_string(i) + ".self_attn.o_proj.bias"; // 1.1 Get q, k, v int bsz = attenInput.dims[0], seqlen = attenInput.dims[1]; @@ -281,9 +293,12 @@ namespace fastllm { Split(qkv, -1, per, per * 2, k); Split(qkv, -1, per * 2, per * 3, v); } else { - Linear(attenInput, weight[qWeightName], Data(), q); - Linear(attenInput, weight[kWeightName], Data(), k); - Linear(attenInput, weight[vWeightName], Data(), v); + Data qBias = (weight.weight.find(qBiasName) != weight.weight.end()) ? weight[qBiasName] : Data(); + Data kBias = (weight.weight.find(kBiasName) != weight.weight.end()) ? weight[kBiasName] : Data(); + Data vBias = (weight.weight.find(vBiasName) != weight.weight.end()) ? weight[vBiasName] : Data(); + Linear(attenInput, weight[qWeightName], qBias, q); + Linear(attenInput, weight[kWeightName], kBias, k); + Linear(attenInput, weight[vWeightName], vBias, v); } std::vector qkvSize = {bsz, seqlen, num_attention_heads, -1}; @@ -363,7 +378,8 @@ namespace fastllm { attenOutput.Reshape({seqlen, bsz, -1}); PermuteSelf(attenOutput, {1, 0, 2}); - Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); + Data oBias = (weight.weight.find(oBiasName) != weight.weight.end()) ? weight[oBiasName] : Data(); + Linear(attenOutput, weight[oWeightName], oBias, attenLastOutput); AddTo(hiddenStates, attenLastOutput); // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput); @@ -437,10 +453,14 @@ namespace fastllm { RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"], 1e-6, attenInput); std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight"; + std::string qBiasName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.bias"; std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight"; + std::string kBiasName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.bias"; std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight"; + std::string vBiasName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.bias"; std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight"; std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight"; + std::string oBiasName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.bias"; // 1.1 Get q, k, v int bsz = attenInput.dims[0], seqlen = attenInput.dims[1]; @@ -451,9 +471,12 @@ namespace fastllm { Split(qkv, -1, per, per * 2, k); Split(qkv, -1, per * 2, per * 3, v); } else { - Linear(attenInput, weight[qWeightName], Data(), q); - Linear(attenInput, weight[kWeightName], Data(), k); - Linear(attenInput, weight[vWeightName], Data(), v); + Data qBias = (weight.weight.find(qBiasName) != weight.weight.end()) ? weight[qBiasName] : Data(); + Data kBias = (weight.weight.find(kBiasName) != weight.weight.end()) ? weight[kBiasName] : Data(); + Data vBias = (weight.weight.find(vBiasName) != weight.weight.end()) ? weight[vBiasName] : Data(); + Linear(attenInput, weight[qWeightName], qBias, q); + Linear(attenInput, weight[kWeightName], kBias, k); + Linear(attenInput, weight[vWeightName], vBias, v); } Data attenOutput = Data(DataType::FLOAT32); @@ -556,7 +579,8 @@ namespace fastllm { CatDirect(attenOutput, curAttenOutput, 1); } - Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); + Data oBias = (weight.weight.find(oBiasName) != weight.weight.end()) ? 
weight[oBiasName] : Data(); + Linear(attenOutput, weight[oWeightName], oBias, attenLastOutput); AddTo(hiddenStates, attenLastOutput); // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput); @@ -600,9 +624,9 @@ namespace fastllm { #endif //auto st = std::chrono::system_clock::now(); #ifdef PY_API - size_t pos = input.rfind("time_stamp:"); - std::string prompt = (generationConfig.enable_hash_id && pos != -1)? input.substr(0, pos):input; - size_t hash_id = std::hash{}(input); + size_t pos = input.rfind("time_stamp:"); + std::string prompt = (generationConfig.enable_hash_id && pos != -1)? input.substr(0, pos):input; + size_t hash_id = std::hash{}(input); Data inputIds = this->weight.tokenizer.Encode(prompt); #else Data inputIds = this->weight.tokenizer.Encode(input); diff --git a/src/pybinding.cpp b/src/pybinding.cpp index 5452656f..41f5ff20 100644 --- a/src/pybinding.cpp +++ b/src/pybinding.cpp @@ -260,6 +260,9 @@ PYBIND11_MODULE(pyfastllm, m) { py::class_(m, "Tokenizer") + .def_readonly("add_dummy_prefix", &fastllm::Tokenizer::addDummyPrefix) + .def_readonly("remove_extra_whitespaces", &fastllm::Tokenizer::removeExtraWhitespaces) + .def_readonly("byte_as_char", &fastllm::Tokenizer::byteAsChar) .def("encode", &fastllm::Tokenizer::Encode) // .def("decode", &fastllm::Tokenizer::Decode) .def("decode", &fastllm::Tokenizer::Decode, "Decode from Tensor") @@ -273,7 +276,8 @@ PYBIND11_MODULE(pyfastllm, m) { return py::bytes(ret); }) .def("clear", &fastllm::Tokenizer::Clear) - .def("insert", &fastllm::Tokenizer::Insert); + .def("insert", &fastllm::Tokenizer::Insert) + .def("set_special_tokens", &fastllm::Tokenizer::SetSpecialTokens); py::class_(m, "WeightMap") .def_readonly("tokenizer", &fastllm::WeightMap::tokenizer) diff --git a/tools/fastllm_pytools/hf_model.py b/tools/fastllm_pytools/hf_model.py index 95fddd2d..e50e8cb3 100644 --- a/tools/fastllm_pytools/hf_model.py +++ b/tools/fastllm_pytools/hf_model.py @@ -1,12 +1,16 @@ from fastllm_pytools import llm; -import torch; import ctypes; -import numpy as np; +import builtins, os, json +import numpy as np +import torch +from transformers import PreTrainedTokenizerFast +from tokenizers.decoders import ByteLevel fastllm_data_type_dict = { "int4": 8, "int8": 3, - "float16": 7 + "float16": 7, + "float32": 0, } fastllm_weight_type_dict = { "linear": 1, @@ -49,7 +53,7 @@ def create(model, # Baichuan-13B-chat modelInfo["user_role"] = (" ") if hasattr(model.generation_config, "user_token_id") else ""; modelInfo["bot_role"] = ("") if hasattr(model.generation_config, "assistant_token_id") else ""; - modelInfo["history_sep"] = ""; + modelInfo["history_sep"] = "" if (modelInfo["model_type"] == "qwen"): if modelInfo["chat_format"] == "chatml": modelInfo["im_end_id"] = tokenizer.im_end_id @@ -61,20 +65,31 @@ def create(model, modelInfo["bot_role"] = ("")) + ">"); modelInfo["history_sep"] = ""; - modelInfo["tokenizer_use_score"] = "1" # 分词带分数 - - weight_type_dict = {}; - module_dict = {}; - weight_bits = {}; - for key, m in model.named_modules(): - if (str(type(m)).find("QuantizedLinear") != -1): - weight_type_dict[key + ".weight"] = "QuantizedLinear"; - weight_bits[key + ".weight"] = m.weight_bit_width; - if (isinstance(m, torch.nn.Linear)): - weight_type_dict[key + ".weight"] = "linear"; - module_dict[key + ".weight"] = m; - if (isinstance(m, torch.nn.Embedding)): - weight_type_dict[key] = "embedding"; + if tokenizer: + modelInfo["tokenizer_use_score"] = "1" # 分词带分数 + if 
len(tokenizer.all_special_tokens) > 0: + token_set = set() + for token in [tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token, tokenizer.pad_token]: + for prompt in [pre_prompt, user_role, bot_role, history_sep]: + if prompt and str(token) in prompt: + modelInfo["tokenizer_has_special_tokens"] = "1" + token_set.add(str(token)) + if len(tokenizer.all_special_tokens) > len(token_set): + modelInfo["tokenizer_has_special_tokens"] = "1" + if hasattr(tokenizer, "sp_model") or (hasattr(tokenizer, "tokenizer") and hasattr(tokenizer.tokenizer, "sp_model")): + try: + import sentencepiece.sentencepiece_model_pb2 as model_pb2 + with open(tokenizer.vocab_file, "rb") as f: + sp_model_data = f.read() + sp_model_proto = model_pb2.ModelProto.FromString(sp_model_data) + modelInfo["tokenizer_add_dummy_prefix"] = sp_model_proto.normalizer_spec.add_dummy_prefix + modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces + except: + pass + elif isinstance(tokenizer, PreTrainedTokenizerFast): + if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \ + and isinstance(tokenizer._tokenizer.decoder, ByteLevel): + modelInfo["tokenizer_byte_as_char"] = True peft_config = {} active_adapter = "" @@ -85,7 +100,7 @@ def create(model, active_adapter = model.active_adapter model = model.cpu(); - dict = model.state_dict(); + dict = model.state_dict() if (modelInfo["model_type"] == "baichuan" and modelInfo["vocab_size"] == 125696): # normalize lm_head for Baichuan 2 @@ -94,16 +109,16 @@ def create(model, model.load_state_dict(dict) model_type = modelInfo["model_type"]; - model = llm.fastllm_lib.create_empty_llm_model(model_type.encode()); + model_handle = llm.fastllm_lib.create_empty_llm_model(model_type.encode()); for it in modelInfo.keys(): - llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode()); + llm.fastllm_lib.add_dict_llm_model(model_handle, str(it).encode(), str(modelInfo[it]).encode()); for adapter_name in peft_config.keys(): adapter_dict = peft_config[adapter_name].__dict__ for it in adapter_dict.keys(): - llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode()) + llm.fastllm_lib.add_adapter_dict_llm_model(model_handle, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode()) if len(active_adapter) != 0: - llm.fastllm_lib.set_adapter(model, str(active_adapter).encode()) + llm.fastllm_lib.set_adapter(model_handle, str(active_adapter).encode()) # 1. 
vocab if (tokenizer): @@ -111,62 +126,97 @@ def create(model, if modelInfo["model_type"] == "qwen": pass else: - tokenizer = tokenizer.tokenizer; + tokenizer = tokenizer.tokenizer if (hasattr(tokenizer, "sp_model")): - piece_size = tokenizer.sp_model.piece_size(); + piece_size = tokenizer.sp_model.piece_size() for i in range(piece_size): - llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(), + llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, tokenizer.sp_model.id_to_piece(i).encode(), i, ctypes.c_float(tokenizer.sp_model.get_score(i))); else: - vocab = tokenizer.get_vocab(); + merges = {} + if (modelInfo["model_type"] == "moss"): + merges = {("".join(bpe_tokens), token_index) for bpe_tokens, token_index in sorted(tokenizer.bpe_ranks.items(), key=lambda kv: kv[1])} + elif isinstance(tokenizer, PreTrainedTokenizerFast): + tokenizer_file = tokenizer.name_or_path + tokenizer.vocab_files_names['tokenizer_file'] + if os.path.exists(tokenizer_file): + with open(tokenizer_file, "r", encoding='utf-8') as f: + bpe_merges = json.load(f)["model"]["merges"] + bpe_merges = [pair.replace(" ", "") for pair in bpe_merges] + merges = builtins.dict(zip(bpe_merges, range(0, -len(bpe_merges), -1))) + vocab = tokenizer.get_vocab() for v in vocab.keys(): + score = merges[v] if v in merges else 1.0 if (modelInfo["model_type"] == "moss"): - vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]; - llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0)); + s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v] + llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, s, vocab[v], ctypes.c_float(score)); elif (modelInfo["model_type"] == "qwen"): - llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0)); + llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, v, vocab[v], ctypes.c_float(1.0)); else: - llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0)); - tot = 0; + llm.fastllm_lib.add_tokenizer_word_llm_model(model_handle, v.encode(), vocab[v], ctypes.c_float(score)); + if ("tokenizer_has_special_tokens" in modelInfo): + special_tokens_str = ''.join(tokenizer.all_special_tokens) + special_tokens_len = [len(x) for x in tokenizer.all_special_tokens] + special_tokens_ids = tokenizer.all_special_ids + llm.fastllm_lib.set_special_tokens_llm_model(model_handle, len(special_tokens_len), + (ctypes.c_int * len(special_tokens_len))(*special_tokens_len), + special_tokens_str.encode(), + (ctypes.c_int * len(special_tokens_ids))(*special_tokens_ids)); + + weight_type_dict = {} + module_dict = {} + weight_bits = {} + for key, m in model.named_modules(): + if (str(type(m)).find("QuantizedLinear") != -1): + weight_type_dict[key + ".weight"] = "QuantizedLinear"; + weight_bits[key + ".weight"] = m.weight_bit_width; + if (isinstance(m, torch.nn.Linear)): + weight_type_dict[key + ".weight"] = "linear" + module_dict[key + ".weight"] = m + if (isinstance(m, torch.nn.Embedding)): + weight_type_dict[key] = "embedding" + + # 2. 
weight + tot = 0 for key in dict: - ori_data_type = 0; - ori_np_data_type = np.float32; - cur_weight_type = 0; + ori_data_type = 0 + ori_np_data_type = np.float32 + cur_weight_type = 0 if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict): - cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]; - to_data_type = 0; + cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]] + to_data_type = 0 if (cur_weight_type == 1): - to_data_type = fastllm_data_type_dict[dtype]; + to_data_type = fastllm_data_type_dict[dtype] if (to_data_type == 7): - ori_data_type = 7; - ori_np_data_type = np.float16; + ori_data_type = 7 + ori_np_data_type = np.float16 elif (cur_weight_type == 2): # TODO bfloat - to_data_type = 0; + to_data_type = 0 weight_name = key - if peft_config is not None: + if hasattr(model, "peft_config"): weight_name = weight_name.replace('base_model.model.', '') if (cur_weight_type == 111): - llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(), + llm.fastllm_lib.add_qlinear_weight_llm_model(model_handle, weight_name.encode(), len(dict[key].shape), (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)), weight_bits[key], dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p), dict[key].numpy().ctypes.data_as(ctypes.c_void_p)); else: - llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(), + llm.fastllm_lib.add_weight_llm_model(model_handle, weight_name.encode(), len(dict[key].shape), (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)), to_data_type, cur_weight_type, ori_data_type, dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p)); tot += 1; print("convert (", tot, "/", len(dict), end = " )\r"); + dict[key].to(torch.device("meta")) print(""); - llm.fastllm_lib.init_params_llm_model(model); - llm.fastllm_lib.warmup_llm_model(model); - ret = llm.model("", id = model); + llm.fastllm_lib.init_params_llm_model(model_handle); + llm.fastllm_lib.warmup_llm_model(model_handle); + ret = llm.model("", id = model_handle); return ret; diff --git a/tools/fastllm_pytools/llm.py b/tools/fastllm_pytools/llm.py index 426f7aa0..f9091d87 100644 --- a/tools/fastllm_pytools/llm.py +++ b/tools/fastllm_pytools/llm.py @@ -55,6 +55,8 @@ fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_float, ctypes.c_int] +fastllm_lib.set_special_tokens_llm_model.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p] + fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p] def set_cpu_threads(threads: int): @@ -145,7 +147,7 @@ def save(self, path : str): fastllm_lib.save_llm_model(self.model, path.encode()); def eval(self): - pass; + return self; def build_tokenizer_decode_token_cache(self): if self.tokenizer_decode_token_cache is not None: diff --git a/tools/fastllm_pytools/torch2flm.py b/tools/fastllm_pytools/torch2flm.py index 6a038d2c..f323f1b0 100644 --- a/tools/fastllm_pytools/torch2flm.py +++ b/tools/fastllm_pytools/torch2flm.py @@ -1,6 +1,9 @@ import struct +import builtins, os, json import numpy as np import torch +from transformers import PreTrainedTokenizerFast +from tokenizers.decoders import ByteLevel def writeString(fo, s): fo.write(struct.pack('i', len(s))) @@ -18,7 +21,8 @@ def writeKeyValue(fo, key, value): } fastllm_weight_type_dict = { "linear": 1, - "embedding": 2 + "embedding": 2, + "QuantizedLinear": 111 } v = np.random.randint(-127, 127, [10, 20]); @@ 
-73,12 +77,6 @@ def tofile(exportPath, print("dtype should be one of ", list(fastllm_data_type_dict.keys())) exit(0) - dict = model.state_dict() - fo = open(exportPath, "wb") - - # 0. version id - fo.write(struct.pack('i', 2)) - # 0.1 model info modelInfo = model.config.__dict__ if model.generation_config is not None: @@ -87,6 +85,11 @@ def tofile(exportPath, print("unknown model_type.") exit(0) + fo = open(exportPath, "wb") + + # 0. version id + fo.write(struct.pack('i', 2)) + if (pre_prompt is not None): modelInfo["pre_prompt"] = pre_prompt if (user_role is not None): @@ -108,7 +111,7 @@ def tofile(exportPath, modelInfo["user_role"] = (" ") if hasattr(model.generation_config, "user_token_id") else ""; modelInfo["bot_role"] = ("") if hasattr(model.generation_config, "assistant_token_id") else ""; modelInfo["history_sep"] = "" - if modelInfo["model_type"] == "qwen": + if (modelInfo["model_type"] == "qwen"): if modelInfo["chat_format"] == "chatml": modelInfo["im_end_id"] = tokenizer.im_end_id modelInfo["im_start_id"] = tokenizer.im_start_id @@ -119,7 +122,31 @@ def tofile(exportPath, modelInfo["bot_role"] = ("")) + ">"); modelInfo["history_sep"] = ""; - modelInfo["tokenizer_use_score"] = "1" # 分词带分数 + if tokenizer: + modelInfo["tokenizer_use_score"] = "1" # 分词带分数 + if len(tokenizer.all_special_tokens) > 0: + token_set = set() + for token in [tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token, tokenizer.pad_token]: + for prompt in [pre_prompt, user_role, bot_role, history_sep]: + if prompt and str(token) in prompt: + modelInfo["tokenizer_has_special_tokens"] = "1" + token_set.add(str(token)) + if len(tokenizer.all_special_tokens) > len(token_set): + modelInfo["tokenizer_has_special_tokens"] = "1" + if hasattr(tokenizer, "sp_model") or (hasattr(tokenizer, "tokenizer") and hasattr(tokenizer.tokenizer, "sp_model")): + try: + import sentencepiece.sentencepiece_model_pb2 as model_pb2 + with open(tokenizer.vocab_file, "rb") as f: + sp_model_data = f.read() + sp_model_proto = model_pb2.ModelProto.FromString(sp_model_data) + modelInfo["tokenizer_add_dummy_prefix"] = sp_model_proto.normalizer_spec.add_dummy_prefix + modelInfo["tokenizer_remove_extra_whitespaces"] = sp_model_proto.normalizer_spec.remove_extra_whitespaces + except: + pass + elif isinstance(tokenizer, PreTrainedTokenizerFast): + if hasattr(tokenizer, "_tokenizer") and hasattr(tokenizer._tokenizer, "decoder") \ + and isinstance(tokenizer._tokenizer.decoder, ByteLevel): + modelInfo["tokenizer_byte_as_char"] = True if hasattr(model, "peft_config"): adapter_size = len(model.peft_config) @@ -137,10 +164,12 @@ def tofile(exportPath, for it in adapter_dict.keys(): writeKeyValue(fo, str(it), str(adapter_dict[it])) + dict = model.state_dict() + # 1. 
vocab if (tokenizer): if (hasattr(tokenizer, "tokenizer")): - if (modelInfo['model_type'] == "qwen"): + if modelInfo["model_type"] == "qwen": pass else: tokenizer = tokenizer.tokenizer @@ -155,20 +184,36 @@ def tofile(exportPath, fo.write(struct.pack('i', i)) fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i)))) else: + merges = {} + if (modelInfo["model_type"] == "moss"): + merges = {("".join(bpe_tokens), token_index) for bpe_tokens, token_index in sorted(tokenizer.bpe_ranks.items(), key=lambda kv: kv[1])} + elif isinstance(tokenizer, PreTrainedTokenizerFast): + tokenizer_file = tokenizer.name_or_path + tokenizer.vocab_files_names['tokenizer_file'] + if os.path.exists(tokenizer_file): + with open(tokenizer_file, "r", encoding='utf-8') as f: + bpe_merges = json.load(f)["model"]["merges"] + bpe_merges = [pair.replace(" ", "") for pair in bpe_merges] + merges = builtins.dict(zip(bpe_merges, range(0, -len(bpe_merges), -1))) vocab = tokenizer.get_vocab() fo.write(struct.pack('i', len(vocab))) for v in vocab.keys(): - if (modelInfo['model_type'] == "qwen"): - s = v - elif (modelInfo["model_type"] == "moss"): + score = merges[v] if v in merges else 1.0 + if (modelInfo["model_type"] == "moss"): s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v] + elif (modelInfo["model_type"] == "qwen"): + s = v else: s = v.encode() fo.write(struct.pack('i', len(s))) for c in s: fo.write(struct.pack('i', c)) fo.write(struct.pack('i', vocab[v])) - fo.write(struct.pack('f', 1.0)) + fo.write(struct.pack('f', score)) + if ("tokenizer_has_special_tokens" in modelInfo): + fo.write(struct.pack('i', len(tokenizer.all_special_tokens))) + for special_token in tokenizer.all_special_tokens: + fo.write(struct.pack('i', len(special_token))) + fo.write(special_token.encode()) else: fo.write(struct.pack('i', 0)) @@ -191,6 +236,7 @@ def tofile(exportPath, if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict): cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]] to_data_type = 0 + if (cur_weight_type == 1): to_data_type = fastllm_data_type_dict[dtype] if (to_data_type == 7): @@ -199,13 +245,11 @@ def tofile(exportPath, cur = dict[key].numpy().astype(ori_np_data_type) + weight_name = key if hasattr(model, "peft_config"): - weight_name = key.replace('base_model.model.', '') - fo.write(struct.pack('i', len(weight_name))) - fo.write(weight_name.encode()) - else: - fo.write(struct.pack('i', len(key))) - fo.write(key.encode()) + weight_name = weight_name.replace('base_model.model.', '') + fo.write(struct.pack('i', len(weight_name))) + fo.write(weight_name.encode()) fo.write(struct.pack('i', len(cur.shape))) for i in cur.shape: fo.write(struct.pack('i', i)) diff --git a/tools/scripts/alpaca2flm.py b/tools/scripts/alpaca2flm.py index c8b473d2..e8103461 100644 --- a/tools/scripts/alpaca2flm.py +++ b/tools/scripts/alpaca2flm.py @@ -1,11 +1,13 @@ import sys -from transformers import LlamaTokenizer, LlamaForCausalLM +import torch +from transformers import AutoTokenizer, LlamaForCausalLM from fastllm_pytools import torch2flm if __name__ == "__main__": model_name = sys.argv[3] if len(sys.argv) >= 4 else 'minlik/chinese-alpaca-33b-merged' - tokenizer = LlamaTokenizer.from_pretrained(model_name) - model = LlamaForCausalLM.from_pretrained(model_name).float() + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + # `torch_dtype=torch.float16` is set by default, if it will not cause an OOM Error, you can load model in float32. 
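    # (Editor's sketch, not part of the original script) if memory allows, a float32 load would look like:
    # model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)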
+    model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
     conf = model.config.__dict__
     conf["model_type"] = "llama"
     dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
diff --git a/tools/scripts/internlm2flm.py b/tools/scripts/internlm2flm.py
new file mode 100755
index 00000000..a725a2a3
--- /dev/null
+++ b/tools/scripts/internlm2flm.py
@@ -0,0 +1,16 @@
+import sys
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from fastllm_pytools import torch2flm
+
+if __name__ == "__main__":
+    modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else "internlm/internlm-chat-7b-v1_1"
+    tokenizer = AutoTokenizer.from_pretrained(modelNameOrPath, trust_remote_code=True);
+    # `torch_dtype=torch.float16` is set by default; if it will not cause an OOM error, you can load the model in float32.
+    model = AutoModelForCausalLM.from_pretrained(modelNameOrPath, trust_remote_code=True, torch_dtype=torch.float16)
+    model = model.eval()
+    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
+    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "internlm-7b-" + dtype + ".flm"
+    torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "",
+                     user_role = "<|User|>:", bot_role = "\n<|Bot|>:",
+                     history_sep = "\n", dtype = dtype)
diff --git a/tools/src/pytools.cpp b/tools/src/pytools.cpp
index 7e4f4b41..795d3112 100644
--- a/tools/src/pytools.cpp
+++ b/tools/src/pytools.cpp
@@ -117,6 +117,21 @@ extern "C" {
         return;
     }
 
+    DLL_EXPORT void set_special_tokens_llm_model(int modelId, int token_cnt, int *lens, char *tokens, int *ids) {
+        std::map <std::string, int> tokenMap;
+        int cur = 0;
+        for (int i = 0; i < token_cnt; i++) {
+            std::string key = "";
+            for (int j = 0; j < lens[i]; j++) {
+                key += tokens[cur++];
+            }
+            tokenMap[key] = ids[i];
+        }
+        auto model = models.GetModel(modelId);
+        model->weight.tokenizer.SetSpecialTokens(tokenMap);
+        return;
+    }
+
     DLL_EXPORT int token_decode(int modelId, int tokenId, int output_buffer_len, char *output_buffer) {
         // Returns 0 normally; when the output buffer is too small, returns the number of bytes needed, including the trailing \0
         if(tokenId == -1) {
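For reference, below is a minimal end-to-end sketch (an editorial illustration, not code from this patch) of how the new tokenizer flags and the `torch2flm.tofile` export path shown above fit together for a llama-family model. The model name, output filename, prompt roles, and the decision to disable the dummy prefix are assumptions taken from the examples earlier in this diff and should be verified against the original tokenizer.

```python
# Hypothetical export sketch; the model name, output path and the
# tokenizer_add_dummy_prefix setting are assumptions, not values mandated by this patch.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from fastllm_pytools import torch2flm

model_name = "internlm/internlm-chat-7b"  # assumed example model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,
                                             torch_dtype=torch.float16).eval()

# torch2flm.tofile reads model.config.__dict__, so extra tokenizer flags can be injected here.
conf = model.config.__dict__
conf["model_type"] = "llama"
# Only set this if the original tokenizer's encode() does NOT prepend a dummy space
# (see the tokenizer-alignment notes in the cookbook section of this diff).
conf["tokenizer_add_dummy_prefix"] = False

torch2flm.tofile("internlm-chat-7b-fp16.flm", model, tokenizer,
                 pre_prompt="", user_role="<|User|>:", bot_role="\n<|Bot|>:",
                 history_sep="\n", dtype="float16")
```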