From 3aedc7a40f1eb338a91e51139ca7fa5fd3134dfc Mon Sep 17 00:00:00 2001 From: TylunasLi Date: Sun, 12 Nov 2023 12:05:08 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=91=BD=E4=BB=A4?= =?UTF-8?q?=E8=A1=8C=E7=A4=BA=E4=BE=8B=EF=BC=8C=E5=B0=9D=E8=AF=95=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E6=9B=B4=E5=A4=9A=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/models/chatglm.h | 4 +-- pyfastllm/examples/cli_low_level.py | 13 +++++---- pyfastllm/examples/cli_simple.py | 41 ++++++++++++++++++++++------- pyfastllm/fastllm/__init__.py | 2 +- src/pybinding.cpp | 1 + 5 files changed, 44 insertions(+), 17 deletions(-) diff --git a/include/models/chatglm.h b/include/models/chatglm.h index 61a9a61d..c0a341c5 100644 --- a/include/models/chatglm.h +++ b/include/models/chatglm.h @@ -67,10 +67,10 @@ namespace fastllm { int GetVersion(); void UpdateSinCos(float rope); - private: - virtual void CausalMask(Data &data, int start) {}; // 因果mask? int gmask_token_id; + private: + virtual void CausalMask(Data &data, int start) {}; // 因果mask? float rope = 1.0f; }; diff --git a/pyfastllm/examples/cli_low_level.py b/pyfastllm/examples/cli_low_level.py index a67f5928..35e2523a 100644 --- a/pyfastllm/examples/cli_low_level.py +++ b/pyfastllm/examples/cli_low_level.py @@ -20,11 +20,14 @@ def args_parser(): def response(model, prompt_input:str, stream_output:bool=False): gmask_token_id = 130001 bos_token_id = 130004 - eos_token_id = 130005 + eos_token_id = model.eos_token_id input_ids = model.weight.tokenizer.encode(prompt_input) - gmask_bos = fastllm.Tensor(fastllm.float32, [1, 2], [gmask_token_id, bos_token_id]) - input_ids = fastllm.cat([input_ids, gmask_bos], 0) + if model.model_type == "chatglm": + gmask_token_id = model.gmask_token_id + bos_token_id = model.bos_token_id + gmask_bos = fastllm.Tensor(fastllm.float32, [1, 2], [gmask_token_id, bos_token_id]) + input_ids = fastllm.cat([gmask_bos, input_ids], 0) seq_len = input_ids.count(0) vmask = [0] * (seq_len * seq_len) @@ -84,11 +87,11 @@ def run_with_low_level(args): prompt = "" while prompt != "stop": prompt = input("User: ") - outputs = response(model, prompt_input=prompt) + outputs = response(model, prompt_input=model.make_input("", 0, prompt)) for output in outputs: print(output) sys.stdout.flush() if __name__ == "__main__": args = args_parser() - run_with_low_level(args) \ No newline at end of file + run_with_low_level(args) diff --git a/pyfastllm/examples/cli_simple.py b/pyfastllm/examples/cli_simple.py index 5680758d..8ecef276 100644 --- a/pyfastllm/examples/cli_simple.py +++ b/pyfastllm/examples/cli_simple.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -import sys +import sys, os import platform import logging import argparse @@ -18,16 +18,15 @@ def args_parser(): def response(model, prompt_input:str, stream_output:bool=False): - gmask_token_id = 130001 - bos_token_id = 130004 input_ids = model.weight.tokenizer.encode(prompt_input) input_ids = input_ids.to_list() - input_ids.extend([gmask_token_id, bos_token_id]) input_ids = [int(v) for v in input_ids] + if model.model_type == "chatglm": + input_ids = [model.gmask_token_id, model.bos_token_id] + input_ids # print(input_ids) - handle = model.launch_response(input_ids) + handle = model.launch_response(input_ids, fastllm.GenerationConfig()) continue_token = True ret_byte = b"" @@ -54,19 +53,37 @@ def run_with_response(args): model.load_weights(model_path) model.warmup() else: + fastllm.set_threads(args.threads) + 
fastllm.set_low_memory(args.low) + if not os.path.exists(model_path): + print(f"模型文件{args.path}不存在!") + exit(-1) model = fastllm.create_llm(model_path) print(f"llm model: {model.model_type}") + print(f"欢迎使用 {model.model_type} 模型. 输入内容对话,reset清空历史记录,stop退出程序"); - prompt = "" - while prompt != "stop": - prompt = input("User: ") + input_text = "" + history = "" + dialog_round = 0 + while input_text != "stop": + input_text = input("User: ") + if 'stop' == input_text: + break + if 'reset' == input_text: + history = '' + continue + prompt = model.make_input(history, dialog_round, input_text) outputs = response(model, prompt_input=prompt, stream_output=True) print(f"{model.model_type}:", end=' ') + past_len = 0 for output in outputs: - print(f"\r{model.model_type}: {output}", end='', flush=True) + print(output[past_len:].strip(), end='', flush=True) + past_len = len(output) print() + model.make_history(history, dialog_round, input_text, output) + dialog_round += 1 def run_with_callback(args): @@ -78,6 +95,11 @@ def run_with_callback(args): model.load_weights(model_path) model.warmup() else: + fastllm.set_threads(args.threads) + fastllm.set_low_memory(args.low) + if not os.path.exists(model_path): + print(f"模型文件{args.path}不存在!") + exit(-1) LLM_TYPE = fastllm.get_llm_type(model_path) model = fastllm.create_llm(model_path) @@ -89,6 +111,7 @@ def print_back(idx:int, content: bytearray): print() sys.stdout.flush() + print(f"欢迎使用 {LLM_TYPE} 模型. 输入内容对话,reset清空历史记录,stop退出程序"); prompt = "" while prompt != "stop": prompt = input("User: ") diff --git a/pyfastllm/fastllm/__init__.py b/pyfastllm/fastllm/__init__.py index e5f51c8d..faeb6d9a 100644 --- a/pyfastllm/fastllm/__init__.py +++ b/pyfastllm/fastllm/__init__.py @@ -7,5 +7,5 @@ from . import utils from . import functions as ops -__version__ = "0.1.5" +__version__ = "0.2.0" diff --git a/src/pybinding.cpp b/src/pybinding.cpp index ca7c0aae..5452656f 100644 --- a/src/pybinding.cpp +++ b/src/pybinding.cpp @@ -294,6 +294,7 @@ PYBIND11_MODULE(pyfastllm, m) { .def_readonly("block_cnt", &fastllm::ChatGLMModel::block_cnt) .def_readonly("bos_token_id", &fastllm::ChatGLMModel::bos_token_id) .def_readonly("eos_token_id", &fastllm::ChatGLMModel::eos_token_id) + .def_readonly("gmask_token_id", &fastllm::ChatGLMModel::gmask_token_id) .def("load_weights", &fastllm::ChatGLMModel::LoadFromFile) .def("make_input", &fastllm::ChatGLMModel::MakeInput) .def("make_history", &fastllm::ChatGLMModel::MakeHistory) From 4e81b03b411a57914d2df2368f07e720413fca24 Mon Sep 17 00:00:00 2001 From: TylunasLi Date: Mon, 13 Nov 2023 22:08:00 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=96=87=E6=A1=A3?= =?UTF-8?q?=EF=BC=8C=E4=B8=BB=E8=A6=81=E6=98=AF=E5=AF=B9pybinding=E5=91=BD?= =?UTF-8?q?=E4=BB=A4=E8=A1=8C=E7=A4=BA=E4=BE=8B=E7=9A=84=E5=BC=95=E7=94=A8?= =?UTF-8?q?=EF=BC=8C=E6=81=A2=E5=A4=8DCPU=E7=BC=96=E8=AF=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 38 ++++++++++++------------ pyfastllm/README.md | 72 ++++++++++++++++++++++++--------------------- pyfastllm/setup.py | 6 ++-- 3 files changed, 62 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index c95ac6d9..fda9b404 100644 --- a/README.md +++ b/README.md @@ -196,21 +196,21 @@ for response in model.stream_response("你好"): 另外还可以设置cpu线程数等内容,详细API说明见 [fastllm_pytools](docs/fastllm_pytools) -这个包不包含low level api,如果需要使用更深入的功能请参考 [Python绑定](#Python绑定) +这个包不包含low level api,如果需要使用更深入的功能请参考 [Python绑定API](#Python绑定API) -## Python绑定 +## Python绑定API ``` -mkdir 
build-py -cd build-py -cmake .. -DPY_API=ON -DUSE_CUDA=ON (只使用CPU则使用 cmake .. -DPY_API=ON 即可) -make -j -cd - -python cli.py -m chatglm -p chatglm-6b-int8.bin 或 -python web_api.py -m chatglm -p chatglm-6b-int8.bin +cd pyfastllm +export USE_CUDA=OFF # 只使用CPU,如需使用GPU则去除本行 +python3 setup.py build +python3 setup.py install +cd examples/ +python cli_simple.py -m chatglm -p chatglm-6b-int8.flm 或 +python web_api.py -m chatglm -p chatglm-6b-int8.flm ``` -上述web api可使用python web_api_client.py进行测试 +上述web api可使用`web_api_client.py`进行测试。更多用法,详见[API文档](pyfastllm/README.md)。 ## 多卡部署 @@ -226,7 +226,7 @@ llm.set_device_map({"cuda:0" : 10, "cuda:1" : 5, "cpu": 1}) # 将模型按不同 ``` -### pybinding中使用多卡部署 +### Python绑定API中使用多卡部署 ``` python import pyfastllm as llm @@ -241,9 +241,7 @@ llm.set_device_map({"cuda:0" : 10, "cuda:1" : 5, "cpu": 1}) # 将模型按不同 fastllm::SetDeviceMap({{"cuda:0", 10}, {"cuda:1", 5}, {"cpu", 1}}); // 将模型按不同比例部署在多个设备上 ``` -## Android上使用 - -### Docker 编译运行 +## Docker 编译运行 docker 运行需要本地安装好 NVIDIA Runtime,且修改默认 runtime 为 nvidia 1. 安装 nvidia-container-runtime @@ -283,6 +281,8 @@ models DOCKER_BUILDKIT=0 docker compose up -d --build ``` +## Android上使用 + ### 编译 ``` sh # 在PC上编译需要下载NDK工具 @@ -324,7 +324,7 @@ python3 tools/chatglm_export.py chatglm2-6b-int8.flm int8 #导出int8模型 python3 tools/chatglm_export.py chatglm2-6b-int4.flm int4 #导出int4模型 ``` -### baichuan模型导出 (默认脚本导出baichuan-13b-chat模型) +#### baichuan模型导出 (默认脚本导出baichuan-13b-chat模型) ``` sh # 需要先安装baichuan环境 @@ -336,7 +336,7 @@ python3 tools/baichuan2flm.py baichuan-13b-int8.flm int8 #导出int8模型 python3 tools/baichuan2flm.py baichuan-13b-int4.flm int4 #导出int4模型 ``` -### baichuan2模型导出 (默认脚本导出baichuan2-7b-chat模型) +#### baichuan2模型导出 (默认脚本导出baichuan2-7b-chat模型) ``` sh # 需要先安装baichuan2环境 @@ -348,7 +348,7 @@ python3 tools/baichuan2_2flm.py baichuan2-7b-int8.flm int8 #导出int8模型 python3 tools/baichuan2_2flm.py baichuan2-7b-int4.flm int4 #导出int4模型 ``` -### MOSS模型导出 +#### MOSS模型导出 ``` sh # 需要先安装MOSS环境 @@ -360,13 +360,13 @@ python3 tools/moss_export.py moss-int8.flm int8 #导出int8模型 python3 tools/moss_export.py moss-int4.flm int4 #导出int4模型 ``` -### LLAMA系列模型导出 +#### LLAMA系列模型导出 ``` sh # 修改build/tools/alpaca2flm.py程序进行导出 # 不同llama模型使用的指令相差很大,需要参照torch2flm.py中的参数进行配置 ``` -### QWEN模型导出 +#### QWEN模型导出 ```sh # 需要先安装QWen环境 # 如果使用自己finetune的模型需要修改qwen2flm.py文件中创建tokenizer, model的代码 diff --git a/pyfastllm/README.md b/pyfastllm/README.md index 46b65281..1fbe7325 100644 --- a/pyfastllm/README.md +++ b/pyfastllm/README.md @@ -57,16 +57,10 @@ Cpp手动编译: mkdir build-py cd build-py cmake .. 
-DUSE_CUDA=ON -DPY_API=ON -make -j4 -python cli.py -p chatglm-6b-int8.bin -t 8 # 与cpp编译的运行结果保持一致 -``` - -Python脚本编译: - -```sh -cd pyfastllm -python build_libs --cuda -python cli.py -p chatglm-6b-int8.bin -t 8 +make -j +cp pyfastllm*.so pyfastllm/examples/ +cd ../pyfastllm/examples/ +python3 cli_simple.py -p chatglm-6b-int8.flm # 与cpp编译的运行结果保持一致 ``` ### wheel包方式 @@ -79,24 +73,35 @@ python cli.py -p chatglm-6b-int8.bin -t 8 pip install pybind11 ``` +- GPU ```sh -cd pyfastllm -python setup.py build -python setup.py install -python cli.py -p chatglm-6b-int8.bin -t 8 +cd pyfastllm/ +python3 setup.py build +python3 setup.py install +cd examples/ +python3 cli_simple.py -p chatglm-6b-int8.flm ``` +- CPU +```sh +cd pyfastllm/ +export USE_CUDA=OFF +python3 setup.py build +python3 setup.py install +cd examples/ +python3 cli_simple.py -p chatglm-6b-int8.flm -t 8 + +``` ## 使用 ### python 调用 -在demo文件夹中存放了几种常见的代码示例: +在examples文件夹中存放了几种常见的代码示例: -demo/cli.py: 以回调函数方式输出回答示例 -demo/cli_thread.py: 多线程调用api接口示例(推荐) -demo/cli_low_api.py: 底层API调用示例 -demo/convert_model.py: 模型转换示例 -demo/web_api.py, demo/web_api_client.py: fastapi webapi调用 -demo/test_ops: 部分op的使用样例及测试 +examples/cli_simple.py: 调用api接口示例(推荐) +examples/cli_low_api.py: 底层API调用示例 +examples/convert_model.py: 模型转换示例 +examples/web_api.py, demo/web_api_client.py: fastapi webapi调用 +examples/test_ops: 部分op的使用样例及测试 ### 命令行工具 @@ -111,22 +116,22 @@ $ fastllm-convert -m chatglm6B -p hf_model_path -o output_flm_path ```sh mkdir build-py cd build-py && cmake .. -DPY_API=ON -DUSE_CUDA=ON && make -j && cd - -cd pyfastllm/demo +cd pyfastllm/examples python web_api.py -m 0 -p path_for_chatglm --max_batch_size 32 ``` 可以使用locust进行压测。A100 40G,chatglm fp16 压测部分结果如下: | 并发数 | 平均调用时间(s) | TP95(s) | TP99(s) | -|----------:|------|------|------| -| 1 | 3.07 | 4.2 | 4.8 | -| 10 | 6.11 | 11.0 | 12.0 | -| 16 | 6.82 | 15.0 | 16.0 | +|----------:|-------|------|------| +| 1 | 3.07 | 4.2 | 4.8 | +| 10 | 6.11 | 11.0 | 12.0 | +| 16 | 6.82 | 15.0 | 16.0 | | 32 | 10.74 | 16.0 | 20.0 | ## API编程接口 ### fastllm数据结构 > fattllm.Tensor数据类型 -- fastllm.float32 +- fastllm.float32 - fastllm.bfloat16 - fastllm.int16 - fastllm.int8 @@ -192,12 +197,13 @@ python web_api.py -m 0 -p path_for_chatglm --max_batch_size 32 支持的模型列表: -| 模型名称 | 对应类 | 备注 -| -- | -- | -- -| ChatGLM-6B | fastllm.ChatGLMModel | -| ChatGLM2-6B | fastllm.ChatGLMModel | 在权重中标注版本 -| Moss | fastllm.MossModel | -| Alpaca | fastllm.llamaModel | +| 模型名称 | 对应类 | 备注 | +| ---- | ---- | ---- | +| ChatGLM-6B | fastllm.ChatGLMModel | | +| ChatGLM2-6B | fastllm.ChatGLMModel | 在权重中标注版本 | +| Moss | fastllm.MossModel | | +| Alpaca | fastllm.LlamaModel | | +| QWen | fastllm.QWenModel | | ## 开发计划(TODO) diff --git a/pyfastllm/setup.py b/pyfastllm/setup.py index 7b6449aa..8cc935bd 100644 --- a/pyfastllm/setup.py +++ b/pyfastllm/setup.py @@ -45,6 +45,8 @@ def build_extension(self, ext: CMakeExtension) -> None: # CMake lets you override the generator - we need to check this. # Can be set with Conda-Build, for example. 
cmake_generator = os.environ.get("CMAKE_GENERATOR", "") + + use_cuda = os.environ.get("USE_CUDA", "ON") # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code @@ -54,7 +56,7 @@ def build_extension(self, ext: CMakeExtension) -> None: f"-DPYTHON_EXECUTABLE={sys.executable}", f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm f"-DPY_API=ON", - f"-DUSE_CUDA=ON", + f"-DUSE_CUDA={use_cuda}", ] build_args = [] # Adding CMake arguments set as environment variable @@ -158,4 +160,4 @@ def build_extension(self, ext: CMakeExtension) -> None: 'LLM::Moss', 'LLM::LLama' ] -) \ No newline at end of file +)
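
For reference, the interaction pattern exercised by the updated `pyfastllm/examples/cli_simple.py` boils down to the loop sketched below. This is an illustrative sketch, not the example verbatim: it assumes the built `fastllm` module is importable under that name (as the examples use it), it takes the example's `response()` generator as the `generate` argument instead of redefining it, and it assumes `make_history` returns the updated history string (mirroring the `MakeHistory` binding), so the result is assigned back each round.

```python
# Sketch of the multi-round chat loop used by the updated cli_simple.py example.
# Assumptions: `fastllm` is whatever module the examples import under that name,
# and `generate` behaves like the example's response(model, prompt_input, stream_output)
# generator, yielding progressively longer partial outputs.
import fastllm


def chat(model_path: str, generate, threads: int = 4, low_memory: bool = False):
    fastllm.set_threads(threads)        # same knobs the examples expose via argparse
    fastllm.set_low_memory(low_memory)
    model = fastllm.create_llm(model_path)

    history, dialog_round = "", 0
    while True:
        input_text = input("User: ")
        if input_text == "stop":
            break
        if input_text == "reset":       # drop the accumulated conversation
            history, dialog_round = "", 0
            continue

        # Build the model-specific prompt from the history kept so far.
        prompt = model.make_input(history, dialog_round, input_text)

        output = ""
        for output in generate(model, prompt_input=prompt, stream_output=True):
            pass                        # stream partial output here if desired
        print(f"{model.model_type}: {output}")

        # Assumed to return the new history string; keep it for the next round.
        history = model.make_history(history, dialog_round, input_text, output)
        dialog_round += 1
```

With the patched `setup.py` honoring the `USE_CUDA` environment variable, the same loop runs unchanged on a CPU-only build installed via `USE_CUDA=OFF python3 setup.py install`.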