From f3af399f291359ce559305ad2052a47d5fb967d1 Mon Sep 17 00:00:00 2001
From: TylunasLi
Date: Wed, 28 Feb 2024 22:36:00 +0800
Subject: [PATCH] Optimize and merge the conversion scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                          | 13 ++++++++++++-
 tools/fastllm_pytools/hf_model.py  |  7 +++++++
 tools/fastllm_pytools/torch2flm.py |  7 +++++++
 tools/scripts/internlm2flm.py      | 16 ----------------
 tools/scripts/llamalike2flm.py     | 24 ++++++++++++++++++++++++
 tools/scripts/qwen2_2flm.py        | 14 --------------
 6 files changed, 50 insertions(+), 31 deletions(-)
 delete mode 100755 tools/scripts/internlm2flm.py
 create mode 100644 tools/scripts/llamalike2flm.py
 delete mode 100644 tools/scripts/qwen2_2flm.py

diff --git a/README.md b/README.md
index 98f8445e..1d2d647b 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ fastllm is a high-performance large-model inference library implemented in pure C++ with no third-party dependencies
 - 🚀 Supports streaming output, making it easy to implement a typewriter effect
 - 🚀 Supports calling from Python
 - 🚀 Front end and back end are decoupled, making it easy to support new compute devices
-- 🚀 Currently supports ChatGLM models, various LLAMA models (ALPACA, VICUNA, etc.), BAICHUAN models, and MOSS models
+- 🚀 Currently supports the ChatGLM series, various LLAMA models (ALPACA, VICUNA, etc.), BAICHUAN models, QWEN models, MOSS models, and more
 
 ## Two lines of code for acceleration (in testing; currently only the chatglm series is supported)
 
@@ -379,6 +379,17 @@ python3 tools/qwen2flm.py qwen-7b-int8.flm int8 # export the int8 model
 python3 tools/qwen2flm.py qwen-7b-int4.flm int4 # export the int4 model
 ```
 
+* **Qwen1.5**
+
+```sh
+# The Qwen2 environment (transformers >= 4.37.0) must be installed first
+# Export the model at the precision you need
+python3 tools/llamalike2flm.py qwen1.5-4b-fp16.flm float16 "qwen/Qwen1.5-4B-Chat" # export the Qwen1.5-4B-Chat float16 model
+python3 tools/llamalike2flm.py qwen1.5-7b-int8.flm int8 "qwen/Qwen1.5-7B-Chat" # export the Qwen1.5-7B-Chat int8 model
+python3 tools/llamalike2flm.py qwen1.5-14b-int4.flm int4 "qwen/Qwen1.5-14B-Chat" # export the Qwen1.5-14B-Chat int4 model
+# The last argument can be replaced with a local model path
+```
+
 ## Development plan
 
 This is commonly known as the wish-list section; if there is a feature you need, please raise it in the discussions
diff --git a/tools/fastllm_pytools/hf_model.py b/tools/fastllm_pytools/hf_model.py
index 538ad36b..cdf59484 100644
--- a/tools/fastllm_pytools/hf_model.py
+++ b/tools/fastllm_pytools/hf_model.py
@@ -60,6 +60,13 @@ def create(model,
         modelInfo["im_start_id"] = tokenizer.im_start_id
     elif (modelInfo["model_type"] == "qwen2"):
         modelInfo["eos_token_id"] = "151645"
+    elif (modelInfo["model_type"] == "internlm"):
+        modelInfo["eos_token_id"] = "103028"
+        if "rotary" in modelInfo:
+            rope_scaling = modelInfo.pop("rotary")
+            if isinstance(rope_scaling, builtins.dict):
+                modelInfo["rope_scaling.type"] = rope_scaling["type"]
+                modelInfo["rope_theta"] = rope_scaling["base"]
     if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
         # chatglm3
         modelInfo["pre_prompt"] = "";
diff --git a/tools/fastllm_pytools/torch2flm.py b/tools/fastllm_pytools/torch2flm.py
index 219f3a59..f86bc7ab 100644
--- a/tools/fastllm_pytools/torch2flm.py
+++ b/tools/fastllm_pytools/torch2flm.py
@@ -118,6 +118,13 @@ def tofile(exportPath,
         modelInfo["im_start_id"] = tokenizer.im_start_id
     elif (modelInfo["model_type"] == "qwen2"):
         modelInfo["eos_token_id"] = "151645"
+    elif (modelInfo["model_type"] == "internlm"):
+        modelInfo["eos_token_id"] = "103028"
+        if "rotary" in modelInfo:
+            rope_scaling = modelInfo.pop("rotary")
+            if isinstance(rope_scaling, builtins.dict):
+                modelInfo["rope_scaling.type"] = rope_scaling["type"]
+                modelInfo["rope_theta"] = rope_scaling["base"]
     if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
         # chatglm3
         modelInfo["pre_prompt"] = "";
diff --git a/tools/scripts/internlm2flm.py b/tools/scripts/internlm2flm.py
deleted file mode 100755
index f2b6a847..00000000
--- a/tools/scripts/internlm2flm.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import sys
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from fastllm_pytools import torch2flm
-
-if __name__ == "__main__":
-    modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else "internlm/internlm-chat-7b-v1_1"
-    tokenizer = AutoTokenizer.from_pretrained(modelNameOrPath, trust_remote_code=True);
-    # `torch_dtype=torch.float16` is set by default, if it will not cause an OOM Error, you can load model in float32.
-    model = AutoModelForCausalLM.from_pretrained(modelNameOrPath, trust_remote_code=True, torch_dtype=torch.float16)
-    model = model.eval()
-    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
-    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "internlm-7b-" + dtype + ".flm"
-    torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "",
-                     user_role = "<|User|>:", bot_role = "\n<|Bot|>:",
-                     history_sep = "\n", dtype = dtype)
diff --git a/tools/scripts/llamalike2flm.py b/tools/scripts/llamalike2flm.py
new file mode 100644
index 00000000..613c827c
--- /dev/null
+++ b/tools/scripts/llamalike2flm.py
@@ -0,0 +1,24 @@
+import sys
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from fastllm_pytools import torch2flm
+
+if __name__ == "__main__":
+    modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else 'qwen/Qwen1.5-7B-Chat'
+    tokenizer = AutoTokenizer.from_pretrained(modelNameOrPath, trust_remote_code=True)
+    # `torch_dtype=torch.float16` is set by default; if it will not cause an OOM error, you can load the model in float32.
+    model = AutoModelForCausalLM.from_pretrained(modelNameOrPath, trust_remote_code=True, torch_dtype=torch.float16)
+    model = model.eval()
+    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
+    exportPath = sys.argv[1] if len(sys.argv) >= 2 else model.config.model_type + "-7b-" + dtype + ".flm"
+    if model.config.model_type == "internlm":
+        torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "",
+                         user_role = "<|User|>:", bot_role = "\n<|Bot|>:",
+                         history_sep = "\n", dtype = dtype)
+    elif model.config.model_type == "qwen2":
+        torch2flm.tofile(exportPath, model, tokenizer, pre_prompt="<|im_start|>system\nYou are a helpful assistant.<|im_end|>", user_role="<|im_start|>user\n",
+                         bot_role="<|im_end|><|im_start|>assistant\n", history_sep="<|im_end|>\n", dtype = dtype)
+    # add custom code for other llama-like chat templates here
+    else:
+        torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "", user_role = "",
+                         bot_role = "", history_sep = "", dtype = dtype)
diff --git a/tools/scripts/qwen2_2flm.py b/tools/scripts/qwen2_2flm.py
deleted file mode 100644
index 628aefd1..00000000
--- a/tools/scripts/qwen2_2flm.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import sys
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from fastllm_pytools import torch2flm
-
-if __name__ == "__main__":
-    model_name = sys.argv[3] if len(sys.argv) >= 4 else 'qwen/Qwen1.5-7B-Chat'
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", torch_dtype=torch.float16)
-    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
-    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "qwen1.5-7b-" + dtype + ".flm"
-    # add custom code here
-    torch2flm.tofile(exportPath, model, tokenizer, pre_prompt="<|im_start|>system\nYou are a helpful assistant.<|im_end|>", user_role="<|im_start|>user\n",
-                     bot_role="<|im_end|><|im_start|>assistant\n", history_sep="<|im_end|>\n", dtype = dtype)
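
Note on the internlm branch added to hf_model.py and torch2flm.py above: it maps an internlm-style `rotary` config entry onto the flat `rope_scaling.type` and `rope_theta` keys that the exporter writes into the .flm header. The standalone sketch below only illustrates that key mapping; the sample `rotary` values ("dynamic", 10000) are assumptions for demonstration and are not taken from the patch.

```python
import builtins

# A made-up modelInfo dict standing in for a flattened internlm config;
# only the key handling below mirrors the patch, the values are illustrative.
modelInfo = {"model_type": "internlm", "rotary": {"type": "dynamic", "base": 10000}}

if modelInfo["model_type"] == "internlm":
    modelInfo["eos_token_id"] = "103028"
    if "rotary" in modelInfo:
        rope_scaling = modelInfo.pop("rotary")  # remove the nested entry
        if isinstance(rope_scaling, builtins.dict):
            modelInfo["rope_scaling.type"] = rope_scaling["type"]  # flatten the scaling type
            modelInfo["rope_theta"] = rope_scaling["base"]         # flatten the rotary base

print(modelInfo)
# {'model_type': 'internlm', 'eos_token_id': '103028',
#  'rope_scaling.type': 'dynamic', 'rope_theta': 10000}
```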