Restructure the configuration file schema
shell-nlp committed Oct 26, 2024
1 parent 4e76591 commit dbac077
Showing 6 changed files with 75 additions and 96 deletions.
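The substance of this commit is a schema change in config.yaml: per-model engine options (model_name_or_path, enable_prefix_caching, dtype, max_model_len, lora) move from the top level of each model entry into a nested model_config block, and start_model_worker now rejects the old flat layout. Below is a minimal sketch of how the two layouts differ and how a launcher can tell them apart, using only the key names visible in this diff; the paths and values are illustrative and extract_engine_config is a hypothetical helper, not part of the commit.

# Sketch only: contrasts the deprecated flat entry with the new nested layout.
old_entry = {  # deprecated flat layout, rejected after this commit
    "enable": True,
    "model_name_or_path": "/models/Qwen2.5-7B-Instruct/",  # illustrative path
    "model_type": "qwen",
    "work_mode": "vllm",
}
new_entry = {  # new layout: engine options nested under model_config
    "enable": True,
    "model_type": "qwen",
    "work_mode": "vllm",
    "model_config": {
        "model_name_or_path": "/models/Qwen2.5-7B-Instruct/",
        "enable_prefix_caching": False,
        "dtype": "auto",
        "max_model_len": 8192,
    },
}

def extract_engine_config(entry: dict) -> dict:
    # Mirrors the check added to start_model_worker: only the nested layout is accepted.
    engine_config = entry.get("model_config")
    if engine_config is None:
        raise ValueError("model_name_or_path must be configured under model_config")
    return engine_config

print(extract_engine_config(new_entry)["model_name_or_path"])
try:
    extract_engine_config(old_entry)
except ValueError as err:
    print("rejected old layout:", err)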
4 changes: 3 additions & 1 deletion gpt_server/model_backend/lmdeploy_backend.py
@@ -28,14 +28,15 @@ class LMDeployBackend(ModelBackend):
def __init__(self, model_path) -> None:
backend = backend_map[os.getenv("backend")]
enable_prefix_caching = bool(os.getenv("enable_prefix_caching", False))

max_model_len = os.getenv("max_model_len", None)
logger.info(f"backend {backend}")
if backend == "pytorch":
backend_config = PytorchEngineConfig(tp=int(os.getenv("num_gpus", "1")))
if backend == "turbomind":
backend_config = TurbomindEngineConfig(
tp=int(os.getenv("num_gpus", "1")),
enable_prefix_caching=enable_prefix_caching,
session_len=int(max_model_len) if max_model_len else None,
)
pipeline_type, pipeline_class = get_task(model_path)
logger.info(f"model architecture: {pipeline_type}")
@@ -78,6 +79,7 @@ async def stream_chat(self, params: Dict[str, Any]) -> AsyncGenerator:
top_k=50 if top_k == -1 else top_k,
stop_words=list(stop),
skip_special_tokens=True,
response_format=params["response_format"],
)
logger.info(f"request_id {int(request_id)}")
results_generator = self.async_engine.generate(
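Note how enable_prefix_caching is parsed in both backends: bool(os.getenv("enable_prefix_caching", False)). Since model_worker_base.py (below) now always exports the flag as the string "True" or "False", bool() returns True even for "False", because any non-empty string is truthy in Python. A small sketch of a stricter parse follows; the env_flag helper is not part of this commit.

import os

def env_flag(name: str, default: bool = False) -> bool:
    # bool("False") is True, so compare the string content instead of truthiness.
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in ("1", "true", "yes")

os.environ["enable_prefix_caching"] = "False"
print(bool(os.getenv("enable_prefix_caching", False)))  # True -- the pitfall
print(env_flag("enable_prefix_caching"))                # False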
6 changes: 6 additions & 0 deletions gpt_server/model_backend/vllm_backend.py
@@ -27,7 +27,11 @@ class VllmBackend(ModelBackend):
def __init__(self, model_path) -> None:
lora = os.getenv("lora", None)
enable_prefix_caching = bool(os.getenv("enable_prefix_caching", False))

max_model_len = os.getenv("max_model_len", None)

tensor_parallel_size = int(os.getenv("num_gpus", "1"))
dtype = os.getenv("dtype", "auto")
max_loras = 1
enable_lora = False
self.lora_requests = []
@@ -53,6 +57,8 @@ def __init__(self, model_path) -> None:
enable_lora=enable_lora,
max_loras=max_loras,
enable_prefix_caching=enable_prefix_caching,
dtype=dtype,
max_model_len=int(max_model_len) if max_model_len else None,
)
self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)

12 changes: 9 additions & 3 deletions gpt_server/model_worker/base/model_worker_base.py
@@ -184,7 +184,9 @@ def run(cls):
parser.add_argument(
"--controller_address", type=str, default="http://localhost:21001"
)
parser.add_argument("--enable_prefix_caching", type=str, default=None)
parser.add_argument("--enable_prefix_caching", type=str, default="False")
parser.add_argument("--dtype", type=str, default="auto")
parser.add_argument("--max_model_len", type=str, default=None)

args = parser.parse_args()
os.environ["num_gpus"] = str(args.num_gpus)
@@ -196,10 +198,14 @@ def run(cls):
os.environ["backend"] = "lmdeploy-pytorch"
elif args.backend == "lmdeploy-turbomind":
os.environ["backend"] = "lmdeploy-turbomind"

if args.lora:
os.environ["lora"] = args.lora
if args.enable_prefix_caching:
os.environ["enable_prefix_caching"] = args.enable_prefix_caching
if args.max_model_len:
os.environ["max_model_len"] = args.max_model_len

os.environ["enable_prefix_caching"] = args.enable_prefix_caching
os.environ["dtype"] = args.dtype

host = args.host
controller_address = args.controller_address
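The new dtype and max_model_len flags, together with enable_prefix_caching, arrive as plain strings and are re-exported as environment variables for the backend to read. A self-contained sketch of that flow, using only the flag names added above; the sample values are illustrative.

import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--enable_prefix_caching", type=str, default="False")
parser.add_argument("--dtype", type=str, default="auto")
parser.add_argument("--max_model_len", type=str, default=None)

args = parser.parse_args(["--dtype", "bfloat16", "--max_model_len", "8192"])

os.environ["enable_prefix_caching"] = args.enable_prefix_caching
os.environ["dtype"] = args.dtype
if args.max_model_len:  # only exported when set, matching run() above
    os.environ["max_model_len"] = args.max_model_len

print(os.environ["dtype"], os.environ.get("max_model_len"))  # bfloat16 8192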
98 changes: 15 additions & 83 deletions gpt_server/script/config.yaml
@@ -11,17 +11,17 @@ model_worker_args:
host: 0.0.0.0
controller_address: http://localhost:21001
models:
- qwenvl:
- minicpmv:
alias: null
enable: false
model_name_or_path: /home/dev/model/qwen/Qwen2-VL-7B-Instruct/
model_type: qwen
model_name_or_path: /home/dev/model/OpenBMB/MiniCPM-V-2_6/
model_type: minicpmv
work_mode: lmdeploy-turbomind
enable_prefix_caching: false
device: gpu
workers:
- gpus:
- 0
- 3
- internvl2:
alias: null
enable: false
@@ -32,7 +32,7 @@ models:
device: gpu
workers:
- gpus:
- 0
- 3
- chatglm4:
alias: chatglm3
enable: true
@@ -43,18 +43,7 @@ models:
device: gpu
workers:
- gpus:
- 0
- qwen:
alias: gpt-4,gpt-3.5-turbo,gpt-3.5-turbo-16k
enable: false
model_name_or_path: /home/dev/model/qwen/Qwen2___5-7B-Instruct/
model_type: qwen
work_mode: lmdeploy-turbomind
enable_prefix_caching: false
device: gpu
workers:
- gpus:
- 1
- 3
- qwen-72b:
alias: qwen,gpt-4,gpt-3.5-turbo,gpt-3.5-turbo-16k
enable: true
@@ -64,54 +53,14 @@ models:
enable_prefix_caching: true
device: gpu
workers:
- gpus:
- 3
- 1
- mixtral:
alias: null
enable: false
model_name_or_path: /home/dev/model/NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT/
model_type: qwen
work_mode: vllm
device: gpu
workers:
- gpus:
- 3
- 0
- llama3:
alias: null
enable: false
model_name_or_path: /home/dev/model/unsloth/unsloth/llama-3-8b-Instruct/
model_type: llama
work_mode: hf
device: gpu
workers:
- gpus:
- 0
- yi:
alias: null
enable: false
model_name_or_path: /home/dev/model/01ai/Yi-34B-Chat/
model_type: yi
work_mode: hf
device: gpu
workers:
- gpus:
- 2
- internlm2:
alias: null
enable: false
model_name_or_path: /home/dev/model/Shanghai_AI_Laboratory/internlm2_5-7b-chat/
model_type: internlm
work_mode: hf
device: gpu
workers:
- gpus:
- 0
- 1
- piccolo-base-zh:
alias: null
enable: true
model_name_or_path: /home/dev/model/assets/embeddings/sensenova/piccolo-base-zh/
model_config:
model_name_or_path: /home/dev/model/assets/embeddings/sensenova/piccolo-base-zh/
model_type: embedding_infinity
work_mode: hf
device: gpu
@@ -121,17 +70,8 @@ models:
- bce-embedding-base_v1:
alias: text-embedding-ada-002
enable: true
model_name_or_path: /home/dev/model/maidalun1020/bce-embedding-base_v1/
model_type: embedding_infinity
work_mode: hf
device: gpu
workers:
- gpus:
- 2
- conan:
alias: null
enable: true
model_name_or_path: /home/dev/model/model1001/Conan/
model_config:
model_name_or_path: /home/dev/model/maidalun1020/bce-embedding-base_v1/
model_type: embedding_infinity
work_mode: hf
device: gpu
@@ -141,7 +81,8 @@ models:
- bge-reranker-base:
alias: null
enable: true
model_name_or_path: /home/dev/model/Xorbits/bge-reranker-base/
model_config:
model_name_or_path: /home/dev/model/Xorbits/bge-reranker-base/
model_type: embedding_infinity
work_mode: hf
device: gpu
@@ -151,17 +92,8 @@ models:
- acge_text_embedding:
alias: text-embedding-ada-002
enable: true
model_name_or_path: /home/dev/model/aspire/acge_text_embedding
model_type: embedding_infinity
work_mode: hf
device: gpu
workers:
- gpus:
- 2
- xiaobu-embedding:
alias: null
enable: true
model_name_or_path: /home/dev/model/lier007/xiaobu-embedding-v2/
model_config:
model_name_or_path: /home/dev/model/aspire/acge_text_embedding
model_type: embedding_infinity
work_mode: hf
device: gpu
5 changes: 3 additions & 2 deletions gpt_server/serving/server_ui.py
@@ -133,6 +133,7 @@ def model_worker_args():
for model_name, model_config in model_config_.items():
if model_tab.split("|")[0].strip() == model_name:
enable_state = model_config["enable"]
engine_config = model_config.get("model_config", None)
left, right = st.columns(2)
with left:

@@ -217,7 +218,7 @@ def on_change():
"enable_prefix_caching",
options := [True, False],
index=options.index(
model_config.get("enable_prefix_caching", False)
engine_config.get("enable_prefix_caching", False)
),
key=f"enable_prefix_caching_{i}",
on_change=on_change,
@@ -272,7 +273,7 @@ def on_change():

model_name_or_path = st.text_input(
"model_name_or_path",
model_config["model_name_or_path"],
engine_config["model_name_or_path"],
key=f"model_name_or_path_{i}",
on_change=on_change,
)
46 changes: 39 additions & 7 deletions gpt_server/utils.py
@@ -1,6 +1,7 @@
import socket
from typing import List, Optional
import os
import sys
import json
from multiprocessing import Process
import subprocess
@@ -70,12 +71,41 @@ def start_model_worker(config: dict):
if model_config["enable"]:
# pprint(model_config)
print()
# model path
model_name_or_path = model_config["model_name_or_path"]
engine_config = model_config.get("model_config", None)
# TODO -------------- backward compatibility --------------
if engine_config:
# new config format
# model path
model_name_or_path = engine_config["model_name_or_path"]
enable_prefix_caching = engine_config.get(
"enable_prefix_caching", "False"
)
dtype = engine_config.get("dtype", "auto")
lora = engine_config.get("lora", None)
max_model_len = engine_config.get("max_model_len", None)
else:
logger.error(
f"""Model {model_name}: the model_name_or_path parameter must now be configured under model_config, for example:
- minicpmv:
alias: null
enable: false
model_type: minicpmv
model_config:
model_name_or_path: /home/dev/model/OpenBMB/MiniCPM-V-2_6/
enable_prefix_caching: false
dtype: auto
work_mode: lmdeploy-turbomind
device: gpu
workers:
- gpus:
- 3
"""
)
sys.exit()

# -------------- backward compatibility --------------
# model type
model_type = model_config["model_type"]
lora = model_config.get("lora", None)
enable_prefix_caching = model_config.get("enable_prefix_caching", False)
# validate model type
# py_path = f"{root_dir}/gpt_server/model_worker/{model_type}.py"
py_path = f"-m gpt_server.model_worker.{model_type}"
@@ -120,12 +150,14 @@ def start_model_worker(config: dict):
+ f" --backend {backend}"
+ f" --host {host}"
+ f" --controller_address {controller_address}"
+ f" --dtype {dtype}"
+ f" --enable_prefix_caching {enable_prefix_caching}"  # whether to enable prefix cache
)
# handle the None case
if lora:
cmd += f" --lora '{json.dumps(lora)}'"
if enable_prefix_caching:  # whether to enable prefix cache
cmd += f" --enable_prefix_caching {enable_prefix_caching}"

if max_model_len:
cmd += f" --max_model_len '{max_model_len}'"
p = Process(target=run_cmd, args=(cmd,))
p.start()
process.append(p)
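Putting the pieces together, start_model_worker now always forwards --dtype and --enable_prefix_caching and appends --max_model_len only when it is set. A hedged reconstruction of the resulting command for one new-style entry follows; the values are illustrative, and the executable prefix plus the other worker flags built earlier in the function are assumed rather than shown in this hunk.

engine_config = {
    "model_name_or_path": "/models/Qwen2.5-7B-Instruct/",  # illustrative
    "enable_prefix_caching": True,
    "dtype": "auto",
    "max_model_len": 8192,
}
dtype = engine_config.get("dtype", "auto")
enable_prefix_caching = engine_config.get("enable_prefix_caching", "False")
max_model_len = engine_config.get("max_model_len", None)

cmd = (
    "python -m gpt_server.model_worker.qwen"  # assumed executable prefix
    + " --host 0.0.0.0"
    + " --controller_address http://localhost:21001"
    + f" --dtype {dtype}"
    + f" --enable_prefix_caching {enable_prefix_caching}"
)
if max_model_len:
    cmd += f" --max_model_len '{max_model_len}'"
print(cmd)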
