From f3af399f291359ce559305ad2052a47d5fb967d1 Mon Sep 17 00:00:00 2001
From: TylunasLi
Date: Wed, 28 Feb 2024 22:36:00 +0800
Subject: [PATCH] Optimize and merge the conversion scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                          | 13 ++++++++++++-
 tools/fastllm_pytools/hf_model.py  |  7 +++++++
 tools/fastllm_pytools/torch2flm.py |  7 +++++++
 tools/scripts/internlm2flm.py      | 16 ----------------
 tools/scripts/llamalike2flm.py     | 24 ++++++++++++++++++++++++
 tools/scripts/qwen2_2flm.py        | 14 --------------
 6 files changed, 50 insertions(+), 31 deletions(-)
 delete mode 100755 tools/scripts/internlm2flm.py
 create mode 100644 tools/scripts/llamalike2flm.py
 delete mode 100644 tools/scripts/qwen2_2flm.py

diff --git a/README.md b/README.md
index 98f8445e..1d2d647b 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ fastllm is a high-performance large-model inference library implemented in pure C++ with no third-party dependencies
 - 🚀 Supports streaming output, making it easy to implement a typewriter effect
 - 🚀 Supports calling from Python
 - 🚀 Front end and back end are decoupled, making it easy to support new compute devices
-- 🚀 Currently supports ChatGLM models, various LLAMA models (ALPACA, VICUNA, etc.), BAICHUAN models, and MOSS models
+- 🚀 Currently supports the ChatGLM series, various LLAMA models (ALPACA, VICUNA, etc.), BAICHUAN models, QWEN models, MOSS models, and more
 
 ## Two lines of code for acceleration (in testing; currently only the chatglm series is supported)
 
@@ -379,6 +379,17 @@ python3 tools/qwen2flm.py qwen-7b-int8.flm int8 # export the int8 model
 python3 tools/qwen2flm.py qwen-7b-int4.flm int4 # export the int4 model
 ```
 
+* **Qwen1.5**
+
+```sh
+# The Qwen2 environment (transformers >= 4.37.0) must be installed first
+# Export the model at the precision you need
+python3 tools/llamalike2flm.py qwen1.5-4b-fp16.flm float16 "qwen/Qwen1.5-4B-Chat" # export the Qwen1.5-4B-Chat float16 model
+python3 tools/llamalike2flm.py qwen1.5-7b-int8.flm int8 "qwen/Qwen1.5-7B-Chat" # export the Qwen1.5-7B-Chat int8 model
+python3 tools/llamalike2flm.py qwen1.5-14b-int4.flm int4 "qwen/Qwen1.5-14B-Chat" # export the Qwen1.5-14B-Chat int4 model
+# The last argument can be replaced with a local model path
+```
+
 ## Development plan
 
 This is commonly known as the wish-list section; if there is a feature you need, please raise it in the discussions
diff --git a/tools/fastllm_pytools/hf_model.py b/tools/fastllm_pytools/hf_model.py
index 538ad36b..cdf59484 100644
--- a/tools/fastllm_pytools/hf_model.py
+++ b/tools/fastllm_pytools/hf_model.py
@@ -60,6 +60,13 @@ def create(model,
         modelInfo["im_start_id"] = tokenizer.im_start_id
     elif (modelInfo["model_type"] == "qwen2"):
         modelInfo["eos_token_id"] = "151645"
+    elif (modelInfo["model_type"] == "internlm"):
+        modelInfo["eos_token_id"] = "103028"
+        if "rotary" in modelInfo:
+            rope_scaling = modelInfo.pop("rotary")
+            if isinstance(rope_scaling, builtins.dict):
+                modelInfo["rope_scaling.type"] = rope_scaling["type"]
+                modelInfo["rope_theta"] = rope_scaling["base"]
     if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
         # chatglm3
         modelInfo["pre_prompt"] = "";
diff --git a/tools/fastllm_pytools/torch2flm.py b/tools/fastllm_pytools/torch2flm.py
index 219f3a59..f86bc7ab 100644
--- a/tools/fastllm_pytools/torch2flm.py
+++ b/tools/fastllm_pytools/torch2flm.py
@@ -118,6 +118,13 @@ def tofile(exportPath,
         modelInfo["im_start_id"] = tokenizer.im_start_id
     elif (modelInfo["model_type"] == "qwen2"):
         modelInfo["eos_token_id"] = "151645"
+    elif (modelInfo["model_type"] == "internlm"):
+        modelInfo["eos_token_id"] = "103028"
+        if "rotary" in modelInfo:
+            rope_scaling = modelInfo.pop("rotary")
+            if isinstance(rope_scaling, builtins.dict):
+                modelInfo["rope_scaling.type"] = rope_scaling["type"]
+                modelInfo["rope_theta"] = rope_scaling["base"]
     if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
         # chatglm3
         modelInfo["pre_prompt"] = "";
diff --git a/tools/scripts/internlm2flm.py b/tools/scripts/internlm2flm.py
deleted file mode 100755
index f2b6a847..00000000
--- a/tools/scripts/internlm2flm.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import sys
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from fastllm_pytools import torch2flm
-
-if __name__ == "__main__":
-    modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else "internlm/internlm-chat-7b-v1_1"
-    tokenizer = AutoTokenizer.from_pretrained(modelNameOrPath, trust_remote_code=True);
-    # `torch_dtype=torch.float16` is set by default, if it will not cause an OOM Error, you can load model in float32.
-    model = AutoModelForCausalLM.from_pretrained(modelNameOrPath, trust_remote_code=True, torch_dtype=torch.float16)
-    model = model.eval()
-    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
-    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "internlm-7b-" + dtype + ".flm"
-    torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "",
-                     user_role = "<|User|>:", bot_role = "\n<|Bot|>:",
-                     history_sep = "\n", dtype = dtype)
diff --git a/tools/scripts/llamalike2flm.py b/tools/scripts/llamalike2flm.py
new file mode 100644
index 00000000..613c827c
--- /dev/null
+++ b/tools/scripts/llamalike2flm.py
@@ -0,0 +1,24 @@
+import sys
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from fastllm_pytools import torch2flm
+
+if __name__ == "__main__":
+    modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else 'qwen/Qwen1.5-7B-Chat'
+    tokenizer = AutoTokenizer.from_pretrained(modelNameOrPath, trust_remote_code=True)
+    # `torch_dtype=torch.float16` is set by default; if it will not cause an OOM error, you can load the model in float32.
+    model = AutoModelForCausalLM.from_pretrained(modelNameOrPath, trust_remote_code=True, torch_dtype=torch.float16)
+    model = model.eval()
+    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
+    exportPath = sys.argv[1] if len(sys.argv) >= 2 else model.config.model_type + "-7b-" + dtype + ".flm"
+    if model.config.model_type == "internlm":
+        torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "",
+                         user_role = "<|User|>:", bot_role = "\n<|Bot|>:",
+                         history_sep = "\n", dtype = dtype)
+    elif model.config.model_type == "qwen2":
+        torch2flm.tofile(exportPath, model, tokenizer, pre_prompt="<|im_start|>system\nYou are a helpful assistant.<|im_end|>", user_role="<|im_start|>user\n",
+                         bot_role="<|im_end|><|im_start|>assistant\n", history_sep="<|im_end|>\n", dtype = dtype)
+    # add custom code for other llama-like chat templates here
+    else:
+        torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "", user_role = "",
+                         bot_role = "", history_sep = "", dtype = dtype)
diff --git a/tools/scripts/qwen2_2flm.py b/tools/scripts/qwen2_2flm.py
deleted file mode 100644
index 628aefd1..00000000
--- a/tools/scripts/qwen2_2flm.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import sys
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from fastllm_pytools import torch2flm
-
-if __name__ == "__main__":
-    model_name = sys.argv[3] if len(sys.argv) >= 4 else 'qwen/Qwen1.5-7B-Chat'
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", torch_dtype=torch.float16)
-    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
-    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "qwen1.5-7b-" + dtype + ".flm"
-    # add custom code here
-    torch2flm.tofile(exportPath, model, tokenizer, pre_prompt="<|im_start|>system\nYou are a helpful assistant.<|im_end|>", user_role="<|im_start|>user\n",
-                     bot_role="<|im_end|><|im_start|>assistant\n", history_sep="<|im_end|>\n", dtype = dtype)
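
Note on the internlm branch added to hf_model.py and torch2flm.py above: it maps an internlm-style `rotary` config entry onto the flat `rope_scaling.type` and `rope_theta` keys that the exporter writes into the .flm header. The standalone sketch below only illustrates that key mapping; the sample `rotary` values ("dynamic", 10000) are assumptions for demonstration and are not taken from the patch.

```python
import builtins

# A made-up modelInfo dict standing in for a flattened internlm config;
# only the key handling below mirrors the patch, the values are illustrative.
modelInfo = {"model_type": "internlm", "rotary": {"type": "dynamic", "base": 10000}}

if modelInfo["model_type"] == "internlm":
    modelInfo["eos_token_id"] = "103028"
    if "rotary" in modelInfo:
        rope_scaling = modelInfo.pop("rotary")  # remove the nested entry
        if isinstance(rope_scaling, builtins.dict):
            modelInfo["rope_scaling.type"] = rope_scaling["type"]  # flatten the scaling type
            modelInfo["rope_theta"] = rope_scaling["base"]         # flatten the rotary base

print(modelInfo)
# {'model_type': 'internlm', 'eos_token_id': '103028',
#  'rope_scaling.type': 'dynamic', 'rope_theta': 10000}
```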