Merge pull request #370 from TylunasLi/pyfastllm
Fix the pyfastllm command-line examples and update the documentation
ztxz16 authored Nov 14, 2023
2 parents 1f4628a + 4e81b03 commit 425efee
Showing 8 changed files with 106 additions and 71 deletions.
38 changes: 19 additions & 19 deletions README.md
@@ -196,21 +196,21 @@ for response in model.stream_response("你好"):

You can also configure the number of CPU threads and other options; see [fastllm_pytools](docs/fastllm_pytools) for the detailed API reference (a minimal usage sketch follows below).

This package does not include the low-level API; if you need deeper functionality, see [Python Bindings](#Python绑定)
This package does not include the low-level API; if you need deeper functionality, see [Python Bindings API](#Python绑定API)
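As a quick orientation, here is a minimal usage sketch of the high-level package, following the pattern of the `stream_response` example above; the `set_cpu_threads` helper name and the model path are assumptions to verify against [fastllm_pytools](docs/fastllm_pytools):

``` python
# Minimal sketch of the high-level fastllm_pytools interface.
# Assumptions: llm.model() loads a converted .flm file and llm.set_cpu_threads()
# sets the CPU thread count -- verify both against docs/fastllm_pytools.
from fastllm_pytools import llm

llm.set_cpu_threads(8)                    # assumed helper for the CPU thread count
model = llm.model("chatglm-6b-int8.flm")  # placeholder path to a converted model

for response in model.stream_response("你好"):
    print(response, flush=True, end="")
```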


## Python Bindings
## Python Bindings API

```
mkdir build-py
cd build-py
cmake .. -DPY_API=ON -DUSE_CUDA=ON (for a CPU-only build, cmake .. -DPY_API=ON is enough)
make -j
cd -
python cli.py -m chatglm -p chatglm-6b-int8.bin
python web_api.py -m chatglm -p chatglm-6b-int8.bin
cd pyfastllm
export USE_CUDA=OFF # CPU only; remove this line to use the GPU
python3 setup.py build
python3 setup.py install
cd examples/
python cli_simple.py -m chatglm -p chatglm-6b-int8.flm
python web_api.py -m chatglm -p chatglm-6b-int8.flm
```
The web API above can be tested with python web_api_client.py
The web API above can be tested with `web_api_client.py`. For more usage, see the [API documentation](pyfastllm/README.md)
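For reference only, a hedged sketch of hitting the web API directly with `requests` follows; the host, port, route, and JSON fields are all assumptions — `web_api_client.py` is the supported client, so check `web_api.py` for the actual interface:

``` python
# Hypothetical smoke test for the web API started above.
# The URL, route, and payload schema are assumptions; read web_api.py /
# web_api_client.py for the real endpoint before relying on this.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/chat",            # assumed host/port/route
    json={"prompt": "你好", "history": []},  # assumed request schema
)
resp.raise_for_status()
print(resp.text)
```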

## Multi-GPU deployment

@@ -226,7 +226,7 @@ llm.set_device_map({"cuda:0" : 10, "cuda:1" : 5, "cpu": 1}) # deploy the model across multiple devices in the given ratios

```

### Multi-GPU deployment with pybinding
### Multi-GPU deployment with the Python bindings API

``` python
import pyfastllm as llm
@@ -241,9 +241,7 @@ llm.set_device_map({"cuda:0" : 10, "cuda:1" : 5, "cpu": 1}) # deploy the model across multiple devices in the given ratios
fastllm::SetDeviceMap({{"cuda:0", 10}, {"cuda:1", 5}, {"cpu", 1}}); // deploy the model across multiple devices in the given ratios
```
## Using on Android
### Building and running with Docker
## Building and running with Docker
Running with Docker requires the NVIDIA Runtime installed locally, with the default runtime changed to nvidia
1. Install nvidia-container-runtime
@@ -283,6 +281,8 @@ models
DOCKER_BUILDKIT=0 docker compose up -d --build
```
## Using on Android
### Build
``` sh
# building on a PC requires downloading the NDK toolchain
@@ -324,7 +324,7 @@ python3 tools/chatglm_export.py chatglm2-6b-int8.flm int8 # export the int8 model
python3 tools/chatglm_export.py chatglm2-6b-int4.flm int4 # export the int4 model
```

### Exporting baichuan models (the default script exports baichuan-13b-chat)
#### Exporting baichuan models (the default script exports baichuan-13b-chat)

``` sh
# the baichuan environment must be installed first
@@ -336,7 +336,7 @@ python3 tools/baichuan2flm.py baichuan-13b-int8.flm int8 # export the int8 model
python3 tools/baichuan2flm.py baichuan-13b-int4.flm int4 # export the int4 model
```

### Exporting baichuan2 models (the default script exports baichuan2-7b-chat)
#### Exporting baichuan2 models (the default script exports baichuan2-7b-chat)

``` sh
# the baichuan2 environment must be installed first
@@ -348,7 +348,7 @@ python3 tools/baichuan2_2flm.py baichuan2-7b-int8.flm int8 # export the int8 model
python3 tools/baichuan2_2flm.py baichuan2-7b-int4.flm int4 # export the int4 model
```

### Exporting the MOSS model
#### Exporting the MOSS model

``` sh
# the MOSS environment must be installed first
@@ -360,13 +360,13 @@ python3 tools/moss_export.py moss-int8.flm int8 # export the int8 model
python3 tools/moss_export.py moss-int4.flm int4 # export the int4 model
```

### Exporting LLAMA-family models
#### Exporting LLAMA-family models
``` sh
# modify build/tools/alpaca2flm.py to perform the export
# different llama models use very different prompt formats; configure them against the parameters in torch2flm.py (a hedged sketch follows this block)
```
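As an illustration only, a hedged sketch of such an export script follows, modeled on the `alpaca2flm.py` pattern; the `torch2flm.tofile()` keyword names (`pre_prompt`, `user_role`, `bot_role`, `history_sep`, `dtype`) and the checkpoint path are assumptions to check against `torch2flm.py`:

``` python
# Hypothetical LLaMA-style export script, patterned after build/tools/alpaca2flm.py.
# The torch2flm.tofile() keyword arguments are assumptions -- verify them in torch2flm.py.
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch2flm

ckpt = "path/to/your-llama-chat-model"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(ckpt, trust_remote_code=True).float()

export_path = sys.argv[1] if len(sys.argv) > 1 else "llama-fp16.flm"
dtype = sys.argv[2] if len(sys.argv) > 2 else "float16"

# Prompt formatting differs a lot between llama variants; match these fields
# to the model's own chat template before exporting.
torch2flm.tofile(export_path, model, tokenizer,
                 pre_prompt="", user_role="<human>: ", bot_role="\n<bot>: ",
                 history_sep="\n", dtype=dtype)
```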

### Exporting QWEN models
#### Exporting QWEN models
```sh
# the QWen environment must be installed first
# if you use your own fine-tuned model, modify the tokenizer/model creation code in qwen2flm.py
4 changes: 2 additions & 2 deletions include/models/chatglm.h
@@ -67,10 +67,10 @@ namespace fastllm {
int GetVersion();

void UpdateSinCos(float rope);
private:
virtual void CausalMask(Data &data, int start) {}; // causal mask?

int gmask_token_id;
private:
virtual void CausalMask(Data &data, int start) {}; // causal mask?

float rope = 1.0f;
};
72 changes: 39 additions & 33 deletions pyfastllm/README.md
@@ -57,16 +57,10 @@ Manual C++ build:
mkdir build-py
cd build-py
cmake .. -DUSE_CUDA=ON -DPY_API=ON
make -j4
python cli.py -p chatglm-6b-int8.bin -t 8 # results should match the C++ build
```

Build via the Python script:

```sh
cd pyfastllm
python build_libs --cuda
python cli.py -p chatglm-6b-int8.bin -t 8
make -j
cp pyfastllm*.so pyfastllm/examples/
cd ../pyfastllm/examples/
python3 cli_simple.py -p chatglm-6b-int8.flm # results should match the C++ build
```

### Wheel package
@@ -79,24 +73,35 @@ python cli.py -p chatglm-6b-int8.bin -t 8
pip install pybind11
```

- GPU
```sh
cd pyfastllm
python setup.py build
python setup.py install
python cli.py -p chatglm-6b-int8.bin -t 8
cd pyfastllm/
python3 setup.py build
python3 setup.py install
cd examples/
python3 cli_simple.py -p chatglm-6b-int8.flm
```

- CPU
```sh
cd pyfastllm/
export USE_CUDA=OFF
python3 setup.py build
python3 setup.py install
cd examples/
python3 cli_simple.py -p chatglm-6b-int8.flm -t 8

```
## Usage

### Calling from Python
The demo folder contains several common code examples
The examples folder contains several common code examples

demo/cli.py: example of printing answers via a callback function
demo/cli_thread.py: example of calling the API from multiple threads (recommended)
demo/cli_low_api.py: example of calling the low-level API
demo/convert_model.py: model conversion example
demo/web_api.py, demo/web_api_client.py: FastAPI web API usage
demo/test_ops: usage examples and tests for some ops
examples/cli_simple.py: example of calling the API (recommended)
examples/cli_low_api.py: example of calling the low-level API
examples/convert_model.py: model conversion example
examples/web_api.py, demo/web_api_client.py: FastAPI web API usage
examples/test_ops: usage examples and tests for some ops

### Command-line tool

@@ -111,22 +116,22 @@ $ fastllm-convert -m chatglm6B -p hf_model_path -o output_flm_path
```sh
mkdir build-py
cd build-py && cmake .. -DPY_API=ON -DUSE_CUDA=ON && make -j && cd -
cd pyfastllm/demo
cd pyfastllm/examples
python web_api.py -m 0 -p path_for_chatglm --max_batch_size 32
```
Load testing can be done with locust. Partial results for chatglm fp16 on an A100 40G (a hedged locustfile sketch follows the table):
| Concurrency | Avg. call time (s) | TP95 (s) | TP99 (s) |
|----------:|------|------|------|
| 1 | 3.07 | 4.2 | 4.8 |
| 10 | 6.11 | 11.0 | 12.0 |
| 16 | 6.82 | 15.0 | 16.0 |
|----------:|-------|------|------|
| 1 | 3.07 | 4.2 | 4.8 |
| 10 | 6.11 | 11.0 | 12.0 |
| 16 | 6.82 | 15.0 | 16.0 |
| 32 | 10.74 | 16.0 | 20.0 |
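A minimal locustfile sketch is shown below; the route and payload are assumptions and must be aligned with what `examples/web_api.py` actually serves:

``` python
# locustfile.py -- hedged sketch for load-testing the web API above.
# The /chat route and JSON body are assumptions; match them to examples/web_api.py.
from locust import HttpUser, task, between

class ChatUser(HttpUser):
    wait_time = between(1, 2)  # pause 1-2 s between simulated requests

    @task
    def chat(self):
        self.client.post("/chat", json={"prompt": "你好"})
```

Run it with, e.g., `locust -f locustfile.py --host http://127.0.0.1:8000` and set the concurrency from the locust web UI.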
## API programming interface

### fastllm data structures

> fastllm.Tensor data types
- fastllm.float32
- fastllm.float32
- fastllm.bfloat16
- fastllm.int16
- fastllm.int8
@@ -192,12 +197,13 @@ python web_api.py -m 0 -p path_for_chatglm --max_batch_size 32

Supported models:

| Model | Class | Notes
| -- | -- | --
| ChatGLM-6B | fastllm.ChatGLMModel |
| ChatGLM2-6B | fastllm.ChatGLMModel | version tagged in the weights
| Moss | fastllm.MossModel |
| Alpaca | fastllm.llamaModel |
| Model | Class | Notes |
| ---- | ---- | ---- |
| ChatGLM-6B | fastllm.ChatGLMModel | |
| ChatGLM2-6B | fastllm.ChatGLMModel | version tagged in the weights |
| Moss | fastllm.MossModel | |
| Alpaca | fastllm.LlamaModel | |
| QWen | fastllm.QWenModel | |
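For orientation, a short sketch of loading one of these classes directly follows; it assumes a no-argument constructor (as the examples use), that the module exposes the class under this name, and that a converted `chatglm-6b-int8.flm` sits in the working directory:

``` python
# Sketch: instantiating a model class from the table above.
# Assumptions: ChatGLMModel() takes no constructor arguments and a converted
# chatglm-6b-int8.flm file exists locally.
import pyfastllm as fastllm  # the examples may instead import the fastllm wrapper package

model = fastllm.ChatGLMModel()
model.load_weights("chatglm-6b-int8.flm")  # bound to ChatGLMModel::LoadFromFile
model.warmup()
prompt = model.make_input("", 0, "你好")   # empty history, round 0
print(prompt)
```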


## Roadmap (TODO)
13 changes: 8 additions & 5 deletions pyfastllm/examples/cli_low_level.py
@@ -20,11 +20,14 @@ def args_parser():
def response(model, prompt_input:str, stream_output:bool=False):
gmask_token_id = 130001
bos_token_id = 130004
eos_token_id = 130005
eos_token_id = model.eos_token_id

input_ids = model.weight.tokenizer.encode(prompt_input)
gmask_bos = fastllm.Tensor(fastllm.float32, [1, 2], [gmask_token_id, bos_token_id])
input_ids = fastllm.cat([input_ids, gmask_bos], 0)
if model.model_type == "chatglm":
gmask_token_id = model.gmask_token_id
bos_token_id = model.bos_token_id
gmask_bos = fastllm.Tensor(fastllm.float32, [1, 2], [gmask_token_id, bos_token_id])
input_ids = fastllm.cat([gmask_bos, input_ids], 0)

seq_len = input_ids.count(0)
vmask = [0] * (seq_len * seq_len)
@@ -84,11 +87,11 @@ def run_with_low_level(args):
prompt = ""
while prompt != "stop":
prompt = input("User: ")
outputs = response(model, prompt_input=prompt)
outputs = response(model, prompt_input=model.make_input("", 0, prompt))
for output in outputs:
print(output)
sys.stdout.flush()

if __name__ == "__main__":
args = args_parser()
run_with_low_level(args)
run_with_low_level(args)
41 changes: 32 additions & 9 deletions pyfastllm/examples/cli_simple.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
import sys
import sys, os
import platform
import logging
import argparse
@@ -18,16 +18,15 @@ def args_parser():


def response(model, prompt_input:str, stream_output:bool=False):
gmask_token_id = 130001
bos_token_id = 130004

input_ids = model.weight.tokenizer.encode(prompt_input)
input_ids = input_ids.to_list()
input_ids.extend([gmask_token_id, bos_token_id])
input_ids = [int(v) for v in input_ids]
if model.model_type == "chatglm":
input_ids = [model.gmask_token_id, model.bos_token_id] + input_ids
# print(input_ids)

handle = model.launch_response(input_ids)
handle = model.launch_response(input_ids, fastllm.GenerationConfig())
continue_token = True

ret_byte = b""
@@ -54,19 +53,37 @@ def run_with_response(args):
model.load_weights(model_path)
model.warmup()
else:
fastllm.set_threads(args.threads)
fastllm.set_low_memory(args.low)
if not os.path.exists(model_path):
print(f"模型文件{args.path}不存在!")
exit(-1)
model = fastllm.create_llm(model_path)
print(f"llm model: {model.model_type}")
print(f"欢迎使用 {model.model_type} 模型. 输入内容对话,reset清空历史记录,stop退出程序");

prompt = ""
while prompt != "stop":
prompt = input("User: ")
input_text = ""
history = ""
dialog_round = 0
while input_text != "stop":
input_text = input("User: ")
if 'stop' == input_text:
break
if 'reset' == input_text:
history = ''
continue
prompt = model.make_input(history, dialog_round, input_text)

outputs = response(model, prompt_input=prompt, stream_output=True)

print(f"{model.model_type}:", end=' ')
past_len = 0
for output in outputs:
print(f"\r{model.model_type}: {output}", end='', flush=True)
print(output[past_len:].strip(), end='', flush=True)
past_len = len(output)
print()
model.make_history(history, dialog_round, input_text, output)
dialog_round += 1


def run_with_callback(args):
Expand All @@ -78,6 +95,11 @@ def run_with_callback(args):
model.load_weights(model_path)
model.warmup()
else:
fastllm.set_threads(args.threads)
fastllm.set_low_memory(args.low)
if not os.path.exists(model_path):
print(f"模型文件{args.path}不存在!")
exit(-1)
LLM_TYPE = fastllm.get_llm_type(model_path)
model = fastllm.create_llm(model_path)

Expand All @@ -89,6 +111,7 @@ def print_back(idx:int, content: bytearray):
print()
sys.stdout.flush()

print(f"欢迎使用 {LLM_TYPE} 模型. 输入内容对话,reset清空历史记录,stop退出程序");
prompt = ""
while prompt != "stop":
prompt = input("User: ")
2 changes: 1 addition & 1 deletion pyfastllm/fastllm/__init__.py
@@ -7,5 +7,5 @@
from . import utils
from . import functions as ops

__version__ = "0.1.5"
__version__ = "0.2.0"

6 changes: 4 additions & 2 deletions pyfastllm/setup.py
@@ -45,6 +45,8 @@ def build_extension(self, ext: CMakeExtension) -> None:
# CMake lets you override the generator - we need to check this.
# Can be set with Conda-Build, for example.
cmake_generator = os.environ.get("CMAKE_GENERATOR", "")

use_cuda = os.environ.get("USE_CUDA", "ON")

# Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
# EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
Expand All @@ -54,7 +56,7 @@ def build_extension(self, ext: CMakeExtension) -> None:
f"-DPYTHON_EXECUTABLE={sys.executable}",
f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm
f"-DPY_API=ON",
f"-DUSE_CUDA=ON",
f"-DUSE_CUDA={use_cuda}",
]
build_args = []
# Adding CMake arguments set as environment variable
@@ -158,4 +160,4 @@ def build_extension(self, ext: CMakeExtension) -> None:
'LLM::Moss',
'LLM::LLama'
]
)
)
1 change: 1 addition & 0 deletions src/pybinding.cpp
@@ -294,6 +294,7 @@ PYBIND11_MODULE(pyfastllm, m) {
.def_readonly("block_cnt", &fastllm::ChatGLMModel::block_cnt)
.def_readonly("bos_token_id", &fastllm::ChatGLMModel::bos_token_id)
.def_readonly("eos_token_id", &fastllm::ChatGLMModel::eos_token_id)
.def_readonly("gmask_token_id", &fastllm::ChatGLMModel::gmask_token_id)
.def("load_weights", &fastllm::ChatGLMModel::LoadFromFile)
.def("make_input", &fastllm::ChatGLMModel::MakeInput)
.def("make_history", &fastllm::ChatGLMModel::MakeHistory)
