From dbac0775e85992c5c68889b3287c4cecda5ee3ca Mon Sep 17 00:00:00 2001
From: 刘宇 <506610466@qq.com>
Date: Sat, 26 Oct 2024 15:09:03 +0800
Subject: [PATCH] Restructure the configuration file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 gpt_server/model_backend/lmdeploy_backend.py |  4 +-
 gpt_server/model_backend/vllm_backend.py     |  6 ++
 .../model_worker/base/model_worker_base.py   | 12 ++-
 gpt_server/script/config.yaml                | 98 +++----------------
 gpt_server/serving/server_ui.py              |  5 +-
 gpt_server/utils.py                          | 46 +++++++--
 6 files changed, 75 insertions(+), 96 deletions(-)

diff --git a/gpt_server/model_backend/lmdeploy_backend.py b/gpt_server/model_backend/lmdeploy_backend.py
index dc61740..b745425 100644
--- a/gpt_server/model_backend/lmdeploy_backend.py
+++ b/gpt_server/model_backend/lmdeploy_backend.py
@@ -28,7 +28,7 @@ class LMDeployBackend(ModelBackend):
     def __init__(self, model_path) -> None:
         backend = backend_map[os.getenv("backend")]
         enable_prefix_caching = bool(os.getenv("enable_prefix_caching", False))
-
+        max_model_len = os.getenv("max_model_len", None)
         logger.info(f"后端 {backend}")
         if backend == "pytorch":
             backend_config = PytorchEngineConfig(tp=int(os.getenv("num_gpus", "1")))
@@ -36,6 +36,7 @@ def __init__(self, model_path) -> None:
             backend_config = TurbomindEngineConfig(
                 tp=int(os.getenv("num_gpus", "1")),
                 enable_prefix_caching=enable_prefix_caching,
+                session_len=int(max_model_len) if max_model_len else None,
             )
         pipeline_type, pipeline_class = get_task(model_path)
         logger.info(f"模型架构:{pipeline_type}")
@@ -78,6 +79,7 @@ async def stream_chat(self, params: Dict[str, Any]) -> AsyncGenerator:
             top_k=50 if top_k == -1 else top_k,
             stop_words=list(stop),
             skip_special_tokens=True,
+            response_format=params["response_format"],
         )
         logger.info(f"request_id {int(request_id)}")
         results_generator = self.async_engine.generate(
diff --git a/gpt_server/model_backend/vllm_backend.py b/gpt_server/model_backend/vllm_backend.py
index 7aaf55f..98cf07c 100644
--- a/gpt_server/model_backend/vllm_backend.py
+++ b/gpt_server/model_backend/vllm_backend.py
@@ -27,7 +27,11 @@ class VllmBackend(ModelBackend):
     def __init__(self, model_path) -> None:
         lora = os.getenv("lora", None)
         enable_prefix_caching = bool(os.getenv("enable_prefix_caching", False))
+
+        max_model_len = os.getenv("max_model_len", None)
+
         tensor_parallel_size = int(os.getenv("num_gpus", "1"))
+        dtype = os.getenv("dtype", "auto")
         max_loras = 1
         enable_lora = False
         self.lora_requests = []
@@ -53,6 +57,8 @@ def __init__(self, model_path) -> None:
             enable_lora=enable_lora,
             max_loras=max_loras,
             enable_prefix_caching=enable_prefix_caching,
+            dtype=dtype,
+            max_model_len=int(max_model_len) if max_model_len else None,
         )
         self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)
 
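Both backends now take their engine limits from environment variables set by the worker process before construction: `max_model_len` is optional and only cast to `int` when present (LMDeploy feeds it to TurboMind as `session_len`, vLLM passes it as `max_model_len`), and vLLM additionally reads `dtype`. Note that `bool(os.getenv("enable_prefix_caching", False))` is truthy for any non-empty string, including `"False"`. A minimal, self-contained sketch of this environment handling; the helper names `optional_int_env` and `env_flag` are illustrative and not part of the patch:

```python
import os
from typing import Optional


def optional_int_env(name: str) -> Optional[int]:
    """Return the variable as an int, or None when it is unset or empty."""
    raw = os.getenv(name, None)
    return int(raw) if raw else None


def env_flag(name: str, default: bool = False) -> bool:
    """Parse a string flag strictly; unlike bool(os.getenv(...)), "False" maps to False."""
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in ("1", "true", "yes", "on")


# Values the backends consume after this patch.
max_model_len = optional_int_env("max_model_len")  # becomes session_len / max_model_len
dtype = os.getenv("dtype", "auto")                 # forwarded to the vLLM engine args
enable_prefix_caching = env_flag("enable_prefix_caching")
```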
diff --git a/gpt_server/model_worker/base/model_worker_base.py b/gpt_server/model_worker/base/model_worker_base.py
index 17de9ee..72eee86 100644
--- a/gpt_server/model_worker/base/model_worker_base.py
+++ b/gpt_server/model_worker/base/model_worker_base.py
@@ -184,7 +184,9 @@ def run(cls):
         parser.add_argument(
             "--controller_address", type=str, default="http://localhost:21001"
         )
-        parser.add_argument("--enable_prefix_caching", type=str, default=None)
+        parser.add_argument("--enable_prefix_caching", type=str, default="False")
+        parser.add_argument("--dtype", type=str, default="auto")
+        parser.add_argument("--max_model_len", type=str, default=None)
         args = parser.parse_args()
 
         os.environ["num_gpus"] = str(args.num_gpus)
@@ -196,10 +198,14 @@ def run(cls):
             os.environ["backend"] = "lmdeploy-pytorch"
         elif args.backend == "lmdeploy-turbomind":
             os.environ["backend"] = "lmdeploy-turbomind"
+
         if args.lora:
             os.environ["lora"] = args.lora
-        if args.enable_prefix_caching:
-            os.environ["enable_prefix_caching"] = args.enable_prefix_caching
+        if args.max_model_len:
+            os.environ["max_model_len"] = args.max_model_len
+
+        os.environ["enable_prefix_caching"] = args.enable_prefix_caching
+        os.environ["dtype"] = args.dtype
 
         host = args.host
         controller_address = args.controller_address
diff --git a/gpt_server/script/config.yaml b/gpt_server/script/config.yaml
index fb6db79..3207f20 100644
--- a/gpt_server/script/config.yaml
+++ b/gpt_server/script/config.yaml
@@ -11,17 +11,17 @@ model_worker_args:
   host: 0.0.0.0
   controller_address: http://localhost:21001
 models:
-- qwenvl:
+- minicpmv:
     alias: null
     enable: false
-    model_name_or_path: /home/dev/model/qwen/Qwen2-VL-7B-Instruct/
-    model_type: qwen
+    model_name_or_path: /home/dev/model/OpenBMB/MiniCPM-V-2_6/
+    model_type: minicpmv
     work_mode: lmdeploy-turbomind
     enable_prefix_caching: false
     device: gpu
     workers:
     - gpus:
-      - 0
+      - 3
 - internvl2:
     alias: null
     enable: false
@@ -32,7 +32,7 @@ models:
     device: gpu
     workers:
     - gpus:
-      - 0
+      - 3
 - chatglm4:
     alias: chatglm3
     enable: true
@@ -43,18 +43,7 @@ models:
     device: gpu
     workers:
     - gpus:
-      - 0
-- qwen:
-    alias: gpt-4,gpt-3.5-turbo,gpt-3.5-turbo-16k
-    enable: false
-    model_name_or_path: /home/dev/model/qwen/Qwen2___5-7B-Instruct/
-    model_type: qwen
-    work_mode: lmdeploy-turbomind
-    enable_prefix_caching: false
-    device: gpu
-    workers:
-    - gpus:
-      - 1
+      - 3
 - qwen-72b:
     alias: qwen,gpt-4,gpt-3.5-turbo,gpt-3.5-turbo-16k
     enable: true
@@ -64,54 +53,14 @@ models:
     enable_prefix_caching: true
     device: gpu
     workers:
-    - gpus:
-      - 3
-      - 1
-- mixtral:
-    alias: null
-    enable: false
-    model_name_or_path: /home/dev/model/NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT/
-    model_type: qwen
-    work_mode: vllm
-    device: gpu
-    workers:
-    - gpus:
-      - 3
-      - 0
-- llama3:
-    alias: null
-    enable: false
-    model_name_or_path: /home/dev/model/unsloth/unsloth/llama-3-8b-Instruct/
-    model_type: llama
-    work_mode: hf
-    device: gpu
-    workers:
-    - gpus:
-      - 0
-- yi:
-    alias: null
-    enable: false
-    model_name_or_path: /home/dev/model/01ai/Yi-34B-Chat/
-    model_type: yi
-    work_mode: hf
-    device: gpu
-    workers:
-    - gpus:
-      - 2
-- internlm2:
-    alias: null
-    enable: false
-    model_name_or_path: /home/dev/model/Shanghai_AI_Laboratory/internlm2_5-7b-chat/
-    model_type: internlm
-    work_mode: hf
-    device: gpu
-    workers:
     - gpus:
       - 0
+      - 1
 - piccolo-base-zh:
     alias: null
     enable: true
-    model_name_or_path: /home/dev/model/assets/embeddings/sensenova/piccolo-base-zh/
+    model_config:
+      model_name_or_path: /home/dev/model/assets/embeddings/sensenova/piccolo-base-zh/
     model_type: embedding_infinity
     work_mode: hf
     device: gpu
@@ -121,17 +70,8 @@ models:
 - bce-embedding-base_v1:
     alias: text-embedding-ada-002
     enable: true
-    model_name_or_path: /home/dev/model/maidalun1020/bce-embedding-base_v1/
-    model_type: embedding_infinity
-    work_mode: hf
-    device: gpu
-    workers:
-    - gpus:
-      - 2
-- conan:
-    alias: null
-    enable: true
-    model_name_or_path: /home/dev/model/model1001/Conan/
+    model_config:
+      model_name_or_path: /home/dev/model/maidalun1020/bce-embedding-base_v1/
     model_type: embedding_infinity
     work_mode: hf
     device: gpu
@@ -141,7 +81,8 @@ models:
 - bge-reranker-base:
     alias: null
     enable: true
-    model_name_or_path: /home/dev/model/Xorbits/bge-reranker-base/
+    model_config:
+      model_name_or_path: /home/dev/model/Xorbits/bge-reranker-base/
     model_type: embedding_infinity
     work_mode: hf
     device: gpu
@@ -151,17 +92,8 @@ models:
 - acge_text_embedding:
     alias: text-embedding-ada-002
     enable: true
-    model_name_or_path: /home/dev/model/aspire/acge_text_embedding
-    model_type: embedding_infinity
-    work_mode: hf
-    device: gpu
-    workers:
-    - gpus:
-      - 2
-- xiaobu-embedding:
-    alias: null
-    enable: true
-    model_name_or_path: /home/dev/model/lier007/xiaobu-embedding-v2/
+    model_config:
+      model_name_or_path: /home/dev/model/aspire/acge_text_embedding
     model_type: embedding_infinity
     work_mode: hf
     device: gpu
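With the new schema, engine-level settings (`model_name_or_path`, `enable_prefix_caching`, `dtype`, `lora`, `max_model_len`) live in a nested `model_config` mapping, while `model_type`, `work_mode`, `device` and `workers` stay at the top level of each model entry. A sketch of reading those nested settings, assuming PyYAML and the structure shown above, with the same defaults that `start_model_worker` applies:

```python
import yaml  # PyYAML

with open("gpt_server/script/config.yaml", "r", encoding="utf8") as f:
    config = yaml.safe_load(f)

for entry in config["models"]:
    for model_name, model_cfg in entry.items():
        if not model_cfg["enable"]:
            continue
        engine_cfg = model_cfg.get("model_config")  # None for a legacy flat entry
        if engine_cfg is None:
            raise SystemExit(f"{model_name}: move model_name_or_path under model_config")
        model_name_or_path = engine_cfg["model_name_or_path"]
        enable_prefix_caching = engine_cfg.get("enable_prefix_caching", "False")
        dtype = engine_cfg.get("dtype", "auto")
        max_model_len = engine_cfg.get("max_model_len", None)
        print(model_name, model_name_or_path, dtype, enable_prefix_caching, max_model_len)
```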
diff --git a/gpt_server/serving/server_ui.py b/gpt_server/serving/server_ui.py
index b8a7b69..e6b510b 100644
--- a/gpt_server/serving/server_ui.py
+++ b/gpt_server/serving/server_ui.py
@@ -133,6 +133,7 @@ def model_worker_args():
         for model_name, model_config in model_config_.items():
             if model_tab.split("|")[0].strip() == model_name:
                 enable_state = model_config["enable"]
+                engine_config = model_config.get("model_config", None)
 
                 left, right = st.columns(2)
                 with left:
@@ -217,7 +218,7 @@ def on_change():
                         "enable_prefix_caching",
                         options := [True, False],
                         index=options.index(
-                            model_config.get("enable_prefix_caching", False)
+                            engine_config.get("enable_prefix_caching", False)
                         ),
                         key=f"enable_prefix_caching_{i}",
                         on_change=on_change,
@@ -272,7 +273,7 @@ def on_change():
 
                     model_name_or_path = st.text_input(
                         "model_name_or_path",
-                        model_config["model_name_or_path"],
+                        engine_config["model_name_or_path"],
                         key=f"model_name_or_path_{i}",
                         on_change=on_change,
                     )
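`server_ui.py` now resolves `model_name_or_path` and `enable_prefix_caching` through the nested `engine_config`; for a legacy flat entry, `model_config.get("model_config", None)` returns `None` and the later lookups fail. A defensive variant, purely illustrative and not part of this patch, that falls back to the flat layout:

```python
from typing import Any, Dict


def resolve_engine_config(model_config: Dict[str, Any]) -> Dict[str, Any]:
    """Prefer the nested model_config block, fall back to the legacy flat layout."""
    engine_config = model_config.get("model_config")
    return engine_config if engine_config is not None else model_config


# Usage mirroring the UI code:
# engine_config = resolve_engine_config(model_config)
# path = engine_config["model_name_or_path"]
# prefix_caching = engine_config.get("enable_prefix_caching", False)
```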
diff --git a/gpt_server/utils.py b/gpt_server/utils.py
index 3dd97ff..57ed248 100644
--- a/gpt_server/utils.py
+++ b/gpt_server/utils.py
@@ -1,6 +1,7 @@
 import socket
 from typing import List, Optional
 import os
+import sys
 import json
 from multiprocessing import Process
 import subprocess
@@ -70,12 +71,41 @@ def start_model_worker(config: dict):
         if model_config["enable"]:
             # pprint(model_config)
             print()
-            # 模型地址
-            model_name_or_path = model_config["model_name_or_path"]
+            engine_config = model_config.get("model_config", None)
+            # TODO -------------- 向前兼容 --------------
+            if engine_config:
+                # 新版本
+                # 模型地址
+                model_name_or_path = engine_config["model_name_or_path"]
+                enable_prefix_caching = engine_config.get(
+                    "enable_prefix_caching", "False"
+                )
+                dtype = engine_config.get("dtype", "auto")
+                lora = engine_config.get("lora", None)
+                max_model_len = engine_config.get("max_model_len", None)
+            else:
+                logger.error(
+                    f"""模型: {model_name}的 model_name_or_path,model_name_or_path 参数的配置必须修改到 model_config 下面!形如:
+- minicpmv:
+    alias: null
+    enable: false
+    model_type: minicpmv
+    model_config:
+      model_name_or_path: /home/dev/model/OpenBMB/MiniCPM-V-2_6/
+      enable_prefix_caching: false
+      dtype: auto
+    work_mode: lmdeploy-turbomind
+    device: gpu
+    workers:
+    - gpus:
+      - 3
+"""
+                )
+                sys.exit()
+
+            # -------------- 向前兼容 --------------
             # 模型类型
             model_type = model_config["model_type"]
-            lora = model_config.get("lora", None)
-            enable_prefix_caching = model_config.get("enable_prefix_caching", False)
             # model type 校验
             # py_path = f"{root_dir}/gpt_server/model_worker/{model_type}.py"
             py_path = f"-m gpt_server.model_worker.{model_type}"
@@ -120,12 +150,14 @@ def start_model_worker(config: dict):
                 + f" --backend {backend}"
                 + f" --host {host}"
                 + f" --controller_address {controller_address}"
+                + f" --dtype {dtype}"
+                + f" --enable_prefix_caching {enable_prefix_caching}"  # 是否开启 prefix cache
             )
+            # 处理为 None的情况
             if lora:
                 cmd += f" --lora '{json.dumps(lora)}'"
-            if enable_prefix_caching:  # 是否开启 prefix cache
-                cmd += f" --enable_prefix_caching {enable_prefix_caching}"
-
+            if max_model_len:
+                cmd += f" --max_model_len '{max_model_len}'"
             p = Process(target=run_cmd, args=(cmd,))
             p.start()
             process.append(p)
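After this change the worker command always carries `--dtype` and `--enable_prefix_caching`, while `--lora` and `--max_model_len` are appended only when configured. A condensed sketch of that assembly; the command prefix and the remaining arguments built elsewhere in `start_model_worker` are replaced here with illustrative placeholders:

```python
import json

# Illustrative inputs; in utils.py they come from the parsed model entry.
cmd = "python -m gpt_server.model_worker.chatglm"  # placeholder prefix
backend, host, controller_address = "vllm", "0.0.0.0", "http://localhost:21001"
dtype, enable_prefix_caching = "auto", "False"
lora = None              # e.g. {"my_lora": "/path/to/lora"}
max_model_len = "65536"  # or None when unset

cmd += (
    f" --backend {backend}"
    + f" --host {host}"
    + f" --controller_address {controller_address}"
    + f" --dtype {dtype}"
    + f" --enable_prefix_caching {enable_prefix_caching}"
)
if lora:
    cmd += f" --lora '{json.dumps(lora)}'"
if max_model_len:
    cmd += f" --max_model_len '{max_model_len}'"
print(cmd)
```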