Merge pull request ztxz16#425 from TylunasLi/llama
Support extrapolated (scaled) rotary position embeddings for llama-family models; add support for Deepseek-Coder-Instruct and Qwen1.5-Chat models
ztxz16 authored Mar 1, 2024
2 parents 8f77cba + 7e1d704 commit 1de512f
Showing 14 changed files with 247 additions and 76 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -21,7 +21,7 @@ fastllm is a pure C++, high-performance LLM inference library with no third-party dependencies
- 🚀 Streaming output is supported, making a typewriter effect easy to implement
- 🚀 Callable from Python
- 🚀 Decoupled front-end/back-end design, making it easy to support new compute devices
- 🚀 Currently supports ChatGLM models, various LLAMA models (ALPACA, VICUNA, etc.), BAICHUAN models, and MOSS models
- 🚀 Currently supports the ChatGLM series, various LLAMA models (ALPACA, VICUNA, etc.), BAICHUAN models, QWEN models, MOSS models, and more

## Acceleration with two lines of code (in testing; only the chatglm series is supported for now)

@@ -379,6 +379,17 @@ python3 tools/qwen2flm.py qwen-7b-int8.flm int8 # export the int8 model
python3 tools/qwen2flm.py qwen-7b-int4.flm int4 # export the int4 model
```

* **Qwen1.5**

```sh
# Requires the Qwen2 environment first (transformers >= 4.37.0)
# Export the model at the desired precision
python3 tools/llamalike2flm.py qwen1.5-7b-fp16.flm float16 "qwen/Qwen1.5-4B-Chat" # export the Qwen1.5-4B-Chat float16 model
python3 tools/llamalike2flm.py qwen1.5-7b-int8.flm int8 "qwen/Qwen1.5-7B-Chat" # export the Qwen1.5-7B-Chat int8 model
python3 tools/llamalike2flm.py qwen1.5-7b-int4.flm int4 "qwen/Qwen1.5-14B-Chat" # export the Qwen1.5-14B-Chat int4 model
# The last argument can also be replaced with a local model path
```
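Once exported, the .flm file can be used from Python. Below is a minimal usage sketch; it assumes the fastllm_pytools bindings (llm.model, response, stream_response) are built and installed as described in the Python section of this README, and the file name refers to one of the models exported above.

```python
from fastllm_pytools import llm

model = llm.model("qwen1.5-7b-int8.flm")      # path to an exported .flm file
print(model.response("Hello, who are you?"))  # single-shot reply

for chunk in model.stream_response("Hello"):  # streaming ("typewriter") output
    print(chunk, end="", flush=True)
```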

## Development plan

This is the so-called wish-list section; if there is a feature you need, please raise it in the Discussions area.
44 changes: 39 additions & 5 deletions docs/llama_cookbook.md
@@ -103,14 +103,14 @@ python3 tools/alpaca2flm.py [output file] [precision] [original model name or path]
history_sep = "<eoa>\n<s>", dtype = dtype)
```

The conversion can be done directly with the `internlm2flm.py` script:
The conversion can be done directly with the `llamalike2flm.py` script:

``` sh
cd build
python3 tools/internlm2flm.py internlm-7b-fp16.flm float16 # export the float16 model
python3 tools/internlm2flm.py internlm-7b-int8.flm int8 # export the int8 model
python3 tools/internlm2flm.py internlm-7b-int4.flm int4 # export the int4 model
python3 tools/internlm2flm.py internlm-7b-int4.flm float16 internlm/internlm-chat-7b # export the internlm-chat-7b float16 model
python3 tools/llamalike2flm.py internlm-7b-fp16.flm float16 internlm/internlm-chat-20b # export the float16 model
python3 tools/llamalike2flm.py internlm-7b-int8.flm int8 internlm/internlm-chat-20b # export the int8 model
python3 tools/llamalike2flm.py internlm-7b-int4.flm int4 internlm/internlm-chat-20b # export the int4 model
python3 tools/llamalike2flm.py internlm-7b-int4.flm float16 internlm/internlm-chat-7b # export the internlm-chat-7b float16 model
```

### XVERSE
@@ -128,6 +128,21 @@ python3 tools/internlm2flm.py internlm-7b-int4.flm float16 internlm/internlm-cha
```
XVERSE-13B-Chat V1 requires NFKC normalization of its input, which fastllm does not yet support, so the original tokenizer must be used.
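For reference only, the sketch below shows one way to apply NFKC normalization with the original Hugging Face tokenizer outside fastllm; unicodedata comes from the Python standard library, and the prompt text is purely illustrative.

```python
import unicodedata
from transformers import AutoTokenizer

# Original XVERSE tokenizer (trust_remote_code may be required depending on the transformers version)
tokenizer = AutoTokenizer.from_pretrained("xverse/XVERSE-13B-Chat", trust_remote_code=True)

raw_prompt = "Human: Hello, please introduce yourself.\n\nAssistant: "
normalized = unicodedata.normalize("NFKC", raw_prompt)  # NFKC-normalize before tokenizing
input_ids = tokenizer(normalized, return_tensors="pt").input_ids
```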

* xverse/[XVERSE-13B-256K](https://huggingface.co/xverse/XVERSE-13B-256K)

This model does not carry its RoPE extrapolation parameters in the config, so they have to be specified manually:
```python
conf = model.config.__dict__
conf["model_type"] = "llama"
conf["rope_theta"] = 500000
conf["rope_scaling.type"] = "dynamic"
conf["rope_scaling.factor"] = 2.0
conf["tokenizer_add_dummy_prefix"] = False
torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "",
user_role = "Human: ", bot_role = "\n\nAssistant: ",
history_sep = "<FLM_FIX_TOKEN_3>", dtype = dtype)
```
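For intuition only: with `rope_scaling.type = "dynamic"`, implementations typically enlarge the RoPE base once the sequence grows beyond the trained context length, following the widely used dynamic-NTK rule. The sketch below uses that common formula with illustrative numbers; the exact behaviour inside fastllm may differ.

```python
def dynamic_ntk_base(base, factor, rotary_dim, seq_len, trained_len):
    # Common dynamic-NTK rule: grow the base once seq_len exceeds the trained context length.
    if seq_len <= trained_len:
        return base
    scale = factor * seq_len / trained_len - (factor - 1)
    return base * scale ** (rotary_dim / (rotary_dim - 2))

# Illustrative numbers: base 500000 and factor 2.0 as set above, rotary_dim 128
print(dynamic_ntk_base(500000.0, 2.0, 128, 200000, 131072))
```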

### Other llama1-series models

* Vicuna v1.1 v1.3
@@ -154,6 +169,11 @@ XVERSE-13B-Chat V1 requires NFKC normalization of its input, which fastllm does not yet support
| 7B | [meta-llama/Llama-2-7b-chat](https://huggingface.co/meta-llama/Llama-2-7b-chat) | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |
| 13B | [meta-llama/Llama-2-13b-chat](https://huggingface.co/meta-llama/Llama-2-13b-chat) | [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) |

|Model| CodeLlama-Instruct |
|-----| ------------------------------------------------------------------------------------------------- |
| 7B | [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) |
| 13B | [codellama/CodeLlama-13b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) |

In the official example code, the system prompt can be omitted:

```python
@@ -216,3 +236,17 @@ XVERSE-13B-Chat V1 requires NFKC normalization of its input, which fastllm does not yet support
"Write a response that appropriately completes the request.\n\n",
user_role="### Instruction:\n", bot_role="\n\n### Response:", history_sep="\n", dtype=dtype)
```

### Deepseek Coder

* [Deepseek-Coder-1.3B-Instruct](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-instruct)
* [Deepseek-Coder-6.7B-Instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)
* [Deepseek-Coder-7B-Instruct v1.5](https://huggingface.co/deepseek-ai/deepseek-coder-7b-instruct-v1.5)

```python
torch2flm.tofile(exportPath, model, tokenizer,
pre_prompt="<FLM_FIX_TOKEN_32013> You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, " \
"and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, " \
"and other non-computer science questions, you will refuse to answer.\n",
user_role="### Instruction:\n", bot_role="\n### Response:\n", history_sep="\n<|EOT|>\n", dtype=dtype)
```
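As with the other cookbook entries, this tofile call is meant to sit inside a small export script invoked like the others (`python3 <script>.py [output file] [precision] [model name or path]`). A rough sketch is shown below; the torch2flm import path and the argument handling are assumptions modeled on the repository's existing export tools, not code from this commit.

```python
import sys
from transformers import AutoModelForCausalLM, AutoTokenizer
from fastllm_pytools import torch2flm  # assumed import path, mirroring the other export scripts

if __name__ == "__main__":
    model_path = sys.argv[3] if len(sys.argv) > 3 else "deepseek-ai/deepseek-coder-6.7b-instruct"
    dtype = sys.argv[2] if len(sys.argv) > 2 else "float16"
    exportPath = sys.argv[1] if len(sys.argv) > 1 else "deepseek-coder-" + dtype + ".flm"

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path).float()

    torch2flm.tofile(exportPath, model, tokenizer,
                     pre_prompt="<FLM_FIX_TOKEN_32013> You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, "
                                "and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, "
                                "and other non-computer science questions, you will refuse to answer.\n",
                     user_role="### Instruction:\n", bot_role="\n### Response:\n",
                     history_sep="\n<|EOT|>\n", dtype=dtype)
```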
2 changes: 1 addition & 1 deletion include/models/basellm.h
@@ -135,7 +135,7 @@ namespace fastllm {
int embed_dim = 4096;
int num_attention_heads = 32;
int head_dim = embed_dim / num_attention_heads;
const int max_positions = 32768;
int max_positions = 32768;
int rotary_dim = 64;
const float scale_attn = sqrt(head_dim);
int block_cnt = 28;
4 changes: 2 additions & 2 deletions include/models/chatglm.h
@@ -66,13 +66,13 @@ namespace fastllm {

int GetVersion();

void UpdateSinCos(float rope);
void UpdateRotaryPosEmb(float rope_factor);

int gmask_token_id;
private:
virtual void CausalMask(Data &data, int start) {}; // causal mask?

float rope = 1.0f;
float rope_factor = 1.0f;
};
}

19 changes: 19 additions & 0 deletions include/models/llama.h
@@ -11,10 +11,20 @@
#include <iostream>

namespace fastllm {

enum RoPEType { // type of positional-encoding extrapolation
BASE = 0,
LINEAR_SCALE = 1,
STATIC_NTK = 2,
DYMAMIC_NTK = 3
};

class LlamaModel: public basellm {
public:
LlamaModel (); // constructor

virtual void InitParams(); // initialize parameter info

// inference
virtual int Forward(
const Data &inputIds,
@@ -65,6 +75,15 @@ namespace fastllm {
virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input

virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current reply

std::pair<std::vector<float>, std::vector<float>> UpdateRotaryPosEmb(float base, float factor); // update the rotary position embedding

protected:
RoPEType rope_type = RoPEType::BASE;

float rope_base = 10000.f;

float rope_factor = 1.f;
};
}

7 changes: 5 additions & 2 deletions src/model.cpp
@@ -104,8 +104,11 @@ namespace fastllm {
model->model_type = "internlm";
} else if (modelType == "llama") {
model = (basellm*)(new LlamaModel());
} else if (modelType=="minicpm") {
model = (basellm*)(new MiniCpmModel());
} else if (modelType == "qwen2") {
model = new LlamaModel();
model->model_type = "qwen";
} else if (modelType=="minicpm") {
model = (basellm*)(new MiniCpmModel());
} else if (modelType == "qwen") {
model = (basellm *) (new QWenModel());
model->weight.tokenizer.type = Tokenizer::TokenizerType::QWEN;
16 changes: 8 additions & 8 deletions src/models/chatglm.cpp
@@ -25,23 +25,23 @@
#endif

namespace fastllm {
void ChatGLMModel::UpdateSinCos(float rope) {
if (rope == this->rope) {
void ChatGLMModel::UpdateRotaryPosEmb(float rope_factor) {
if (rope_factor == this->rope_factor) {
return;
}
this->rope = rope;
this->rope_factor = rope_factor;
sin.resize(max_positions);
cos.resize(max_positions);
std::vector <float> invFreq;
for (int i = 0; i < rotary_dim; i += 2) {
int base = this->bot_role.empty() ? 10000 : 10000 * rope;
int base = this->bot_role.empty() ? 10000 : 10000 * this->rope_factor;
invFreq.push_back(1.0 / pow(base, (float)i / rotary_dim));
}
for (int i = 0; i < max_positions; i++) {
sin[i].resize(rotary_dim);
cos[i].resize(rotary_dim);
for (int j = 0; j < invFreq.size(); j++) {
float scale = this->bot_role.empty() ? rope : 1.0f;
float scale = this->bot_role.empty() ? rope_factor : 1.0f;
sin[i][j] = ::sin((float)i / scale * invFreq[j]);
cos[i][j] = ::cos((float)i / scale * invFreq[j]);
}
@@ -65,8 +65,8 @@ namespace fastllm {
this->eos_token_id = 130005; // eos token of later V1 releases; can be overridden via config.json
this->gmask_token_id= 150001; // earliest V1 release (150528 tokens); some config.json files have no gmask_token_id, so use this default.

this->rope = -1.0;
this->UpdateSinCos(1.0f);
this->rope_factor = -1.0;
this->UpdateRotaryPosEmb(1.0f);
weight.embeddingNames.insert("transformer.word_embeddings.weight");
weight.embeddingNames.insert("transformer.embedding.word_embeddings.weight");
}
@@ -82,7 +82,7 @@ namespace fastllm {
this->bos_token_id = 64792;
}
if (this->weight.dicts.find("rope_ratio") != this->weight.dicts.end()) {
UpdateSinCos(atof(this->weight.dicts["rope_ratio"].c_str()));
UpdateRotaryPosEmb(atof(this->weight.dicts["rope_ratio"].c_str()));
}
}

