Skip to content

Commit

Permalink
更新vllm==0.6.6.post1
Browse files Browse the repository at this point in the history
  • Loading branch information
shell-nlp committed Dec 29, 2024
1 parent 29090cc commit 2d4fff9
Show file tree
Hide file tree
Showing 7 changed files with 461 additions and 39 deletions.
23 changes: 21 additions & 2 deletions gpt_server/model_backend/utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,34 @@
from typing import List
from typing import List, Type, Union
from pydantic import BaseModel
from transformers.generation.logits_process import LogitsProcessor
from transformers import PreTrainedTokenizerBase
from transformers.generation.stopping_criteria import (
StoppingCriteria,
StoppingCriteriaList,
STOPPING_CRITERIA_INPUTS_DOCSTRING,
add_start_docstrings,
)

import xgrammar as xgr
import torch


class XgrammarLogitsProcessor(LogitsProcessor):
    """Constrain generation to a JSON schema using xgrammar.

    Usage: construct once per tokenizer, then call
    ``get_grammar_compiler(schema)`` per request to compile the schema;
    afterwards the object can be used as a HuggingFace logits processor.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase):
        # Build the grammar compiler from the tokenizer's vocabulary metadata.
        tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
        self.grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
        # No grammar compiled yet: __call__ is invalid until
        # get_grammar_compiler() installs a per-request processor.
        self.xgr_logits_processor = None

    def get_grammar_compiler(self, schema: Union[str, Type[BaseModel]]):
        """Compile *schema* (a JSON-schema string or a pydantic model class)
        and install/return the HF logits processor that enforces it."""
        compiled_grammar = self.grammar_compiler.compile_json_schema(schema)
        self.xgr_logits_processor = xgr.contrib.hf.LogitsProcessor(compiled_grammar)
        return self.xgr_logits_processor

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        # Fail fast with a clear message instead of an opaque AttributeError
        # when no schema has been compiled for this request.
        if self.xgr_logits_processor is None:
            raise RuntimeError(
                "XgrammarLogitsProcessor: call get_grammar_compiler(schema) "
                "before using it as a logits processor."
            )
        return self.xgr_logits_processor(input_ids=input_ids, scores=scores)


class InvalidScoreLogitsProcessor(LogitsProcessor):
def __call__(
self, input_ids: torch.LongTensor, scores: torch.FloatTensor
Expand Down
2 changes: 1 addition & 1 deletion gpt_server/model_backend/vllm_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ async def stream_chat(self, params: Dict[str, Any]) -> AsyncGenerator:
choice=None,
grammar=None,
json_object=guided_json_object,
backend="lm-format-enforcer",
backend="xgrammar",
whitespace_pattern=None,
)
# ---- 支持 response_format,但是官方对BPE分词器的支持仍然太差 ----
Expand Down
54 changes: 31 additions & 23 deletions gpt_server/model_worker/embedding_infinity.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from loguru import logger

from infinity_emb import AsyncEngineArray, EngineArgs, AsyncEmbeddingEngine
from infinity_emb.inference.select_model import get_engine_type_from_config
from gpt_server.model_worker.base.model_worker_base import ModelWorkerBase

label_to_category = {
Expand Down Expand Up @@ -49,30 +50,26 @@ def __init__(
bettertransformer = True
if model_type is not None and "deberta" in model_type:
bettertransformer = False
self.engine: AsyncEmbeddingEngine = AsyncEngineArray.from_args(
[
EngineArgs(
model_name_or_path=model_path,
engine="torch",
embedding_dtype="float32",
dtype="float32",
device=device,
bettertransformer=bettertransformer,
)
]
)[0]
engine_args = EngineArgs(
model_name_or_path=model_path,
engine="torch",
embedding_dtype="float32",
dtype="float32",
device=device,
bettertransformer=bettertransformer,
)
engine_type = get_engine_type_from_config(engine_args)
engine_type_str = str(engine_type)
if "EmbedderEngine" in engine_type_str:
self.mode = "embedding"
elif "RerankEngine" in engine_type_str:
self.mode = "rerank"
elif "ImageEmbedEngine" in engine_type_str:
self.mode = "image"
self.engine: AsyncEmbeddingEngine = AsyncEngineArray.from_args([engine_args])[0]
loop = asyncio.get_running_loop()
loop.create_task(self.engine.astart())
self.mode = "embedding"
# rerank
for model_name in model_names:
if "rerank" in model_name:
self.mode = "rerank"
break
if self.mode == "rerank":
logger.info("正在使用 rerank 模型...")
elif self.mode == "embedding":
logger.info("正在使用 embedding 模型...")
logger.info(f"正在使用 {self.mode} 模型...")
logger.info(f"模型:{model_names[0]}")

async def astart(self):
Expand All @@ -83,7 +80,7 @@ async def get_embeddings(self, params):
logger.info(f"worker_id: {self.worker_id}")
self.call_ct += 1
ret = {"embedding": [], "token_num": 0}
texts = params["input"]
texts: list = params["input"]
if self.mode == "embedding":
texts = list(map(lambda x: x.replace("\n", " "), texts))
embeddings, usage = await self.engine.embed(sentences=texts)
Expand All @@ -105,6 +102,17 @@ async def get_embeddings(self, params):
embedding = [
[round(float(score["relevance_score"]), 6)] for score in ranking
]
elif self.mode == "image":
if (
isinstance(texts[0], bytes)
or "http" in texts[0]
or "data:image" in texts[0]
):
embeddings, usage = await self.engine.image_embed(images=texts)
else:
embeddings, usage = await self.engine.embed(sentences=texts)

embedding = [embedding.tolist() for embedding in embeddings]
ret["embedding"] = embedding
ret["token_num"] = usage
return ret
Expand Down
3 changes: 2 additions & 1 deletion gpt_server/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def kill_child_processes(parent_pid, including_parent=False):

def signal_handler(signum, frame):
    """SIGINT (Ctrl-C) handler: stop the server cleanly, then exit.

    *signum* and *frame* follow the ``signal`` module's handler
    signature and are unused here.
    """
    print("\nCtrl-C detected! Cleaning up...")
    # stop_server() now owns the full teardown (it replaced the earlier
    # direct kill_child_processes(parent_pid) call).
    stop_server()
    # Raise SystemExit directly rather than relying on the site-injected
    # exit() builtin, which is not guaranteed to exist.
    raise SystemExit(0)  # normal program exit after cleanup


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ dependencies = [
"torch==2.5.1",
"torchvision==0.20.1",
"transformers==4.45.2",
"vllm==0.6.5",
"vllm==0.6.6.post1",
"qwen_vl_utils",
"evalscope[perf]==0.7.0",
"modelscope==1.20.1",
Expand Down
7 changes: 3 additions & 4 deletions tests/test_openai_completion_response_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,17 @@

# 新版本 openai
client = OpenAI(api_key="EMPTY", base_url="http://localhost:8082/v1")

# 方式一
output = client.chat.completions.create(
model="qwen-32b",
model="qwen-3b",
messages=[{"role": "user", "content": "南京到北京多远"}],
response_format={"type": "text"},
)
print(output.choices[0].message.content)
print("-" * 100)
# 方式二
output = client.chat.completions.create(
model="qwen-32b",
model="qwen-3b",
messages=[
{"role": "system", "content": "用json进行回答"},
{"role": "user", "content": "南京到北京多远"},
Expand All @@ -32,7 +31,7 @@ class Distance(BaseModel):


output = client.beta.chat.completions.parse(
model="qwen-32b",
model="qwen-3b",
messages=[{"role": "user", "content": "南京到北京多远"}],
response_format=Distance,
)
Expand Down
Loading

0 comments on commit 2d4fff9

Please sign in to comment.