Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor embedding/ranking/LLM request/response to follow the OpenAI format #405

Merged
merged 8 commits into from
Aug 12, 2024
1 change: 1 addition & 0 deletions comps/cores/mega/gateway.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@
temperature=chat_request.temperature if chat_request.temperature else 0.01,
repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
streaming=stream_opt,
chat_template=chat_request.chat_template if chat_request.chat_template else None,

Check warning on line 121 in comps/cores/mega/gateway.py

View check run for this annotation

Codecov / codecov/patch

comps/cores/mega/gateway.py#L121

Added line #L121 was not covered by tests
)
result_dict, runtime_graph = await self.megaservice.schedule(
initial_inputs={"text": prompt}, llm_parameters=parameters
Expand Down
268 changes: 226 additions & 42 deletions comps/cores/proto/api_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,24 +30,243 @@ class UsageInfo(BaseModel):
completion_tokens: Optional[int] = 0


class ResponseFormat(BaseModel):
    # Response format selector used by ChatCompletionRequest.response_format.
    # `type` accepts exactly one of the two literals "text" or "json_object".
    type: Literal["text", "json_object"]


class StreamOptions(BaseModel):
    # Options object carried by ChatCompletionRequest.stream_options.
    # refer https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/protocol.py#L105
    #
    # The explicit None default lets callers omit the field: under pydantic v2
    # an Optional[...] annotation without a default is still a *required*
    # field, so an empty `stream_options` object would otherwise be rejected.
    include_usage: Optional[bool] = None


class FunctionDefinition(BaseModel):
    # Describes a callable function; referenced by ChatCompletionToolsParam.function.
    # Only `name` is required; `description` and `parameters` default to None.
    name: str
    description: Optional[str] = None
    parameters: Optional[Dict[str, Any]] = None


class ChatCompletionToolsParam(BaseModel):
    # One entry of ChatCompletionRequest.tools.
    # `type` can only be "function" (also its default); `function` is required.
    type: Literal["function"] = "function"
    function: FunctionDefinition


class ChatCompletionNamedFunction(BaseModel):
    # Identifies a function by name only; used by ChatCompletionNamedToolChoiceParam.
    name: str


class ChatCompletionNamedToolChoiceParam(BaseModel):
    # Non-"none" alternative accepted by ChatCompletionRequest.tool_choice.
    # `function` is required; `type` can only be "function" (also its default).
    function: ChatCompletionNamedFunction
    type: Literal["function"] = "function"


class TokenCheckRequestItem(BaseModel):
    # One entry of TokenCheckRequest.prompts; all three fields are required.
    model: str
    prompt: str
    max_tokens: int


class TokenCheckRequest(BaseModel):
    # Request wrapper holding a list of TokenCheckRequestItem entries.
    prompts: List[TokenCheckRequestItem]


class TokenCheckResponseItem(BaseModel):
    # One entry of TokenCheckResponse.prompts; all three fields are required.
    # NOTE(review): `fits` presumably reports whether `tokenCount` stays within
    # `contextLength` — confirm against the code that builds this response.
    fits: bool
    tokenCount: int
    contextLength: int


class TokenCheckResponse(BaseModel):
    # Response wrapper holding a list of TokenCheckResponseItem entries.
    prompts: List[TokenCheckResponseItem]


class EmbeddingRequest(BaseModel):
    # Embedding request schema.
    # Fields are ordered as in the official OpenAI API documentation:
    # https://platform.openai.com/docs/api-reference/embeddings
    model: Optional[str] = None
    # Required: token ids, batches of token ids, one string, or a list of strings.
    input: Union[List[int], List[List[int]], str, List[str]]
    # Defaults to "float"; the pattern only admits "float" or "base64".
    encoding_format: Optional[str] = Field("float", pattern="^(float|base64)$")
    dimensions: Optional[int] = None
    user: Optional[str] = None

    # Constant tag identifying this payload as an embedding request.
    request_type: Literal["embedding"] = "embedding"


class EmbeddingResponseData(BaseModel):
    # One entry of EmbeddingResponse.data.
    index: int
    object: str = "embedding"
    # Either a list of floats or a single string.
    # NOTE(review): the string form presumably corresponds to the "base64"
    # encoding_format of EmbeddingRequest — confirm with the producer.
    embedding: Union[List[float], str]


class EmbeddingResponse(BaseModel):
    # Embedding response envelope: `data` is required, `object` defaults to
    # "list", and `model` / `usage` default to None.
    object: str = "list"
    model: Optional[str] = None
    data: List[EmbeddingResponseData]
    usage: Optional[UsageInfo] = None


class RetrievalRequest(BaseModel):
    # Retrieval request schema. Both `embedding` and `input` are optional and
    # default to None; every search parameter below has a default.

    # Optional[...] makes the annotation agree with the None default, so an
    # explicit null is accepted in addition to omitting the field.
    embedding: Optional[Union[EmbeddingResponse, List[float]]] = None
    input: Optional[str] = None
    # Defaults to "similarity"; other search types such as "mmr" may be needed.
    search_type: str = "similarity"
    k: int = 4
    distance_threshold: Optional[float] = None
    fetch_k: int = 20
    lambda_mult: float = 0.5
    score_threshold: float = 0.2

    # Constant tag identifying this payload as a retrieval request.
    request_type: Literal["retrieval"] = "retrieval"


class RetrievalResponseData(BaseModel):
    # One retrieved document: required `text` plus optional free-form `metadata`.
    text: str
    metadata: Optional[Dict[str, Any]] = None


class RetrievalResponse(BaseModel):
    # Response wrapper holding a list of RetrievalResponseData entries.
    retrieved_docs: List[RetrievalResponseData]


class RerankingRequest(BaseModel):
    # Reranking request schema: required `input` text plus `retrieved_docs`,
    # which may be RetrievalResponseData items, plain dicts, or plain strings.
    input: str
    retrieved_docs: Union[List[RetrievalResponseData], List[Dict[str, Any]], List[str]]
    top_n: int = 1

    # Constant tag identifying this payload as a reranking request.
    request_type: Literal["reranking"] = "reranking"


class RerankingResponseData(BaseModel):
    # One reranked document: required `text` and a `score` that defaults to 0.0.
    text: str
    score: Optional[float] = 0.0


class RerankingResponse(BaseModel):
    # Response wrapper holding a list of RerankingResponseData entries.
    reranked_docs: List[RerankingResponseData]


class ChatCompletionRequest(BaseModel):
# Chat completion request schema. Besides the OpenAI-style chat fields it also
# declares embedding, retrieval and reranking fields (see the sections below).
#
# NOTE(review): this span comes from a rendered diff whose +/- markers and
# indentation were stripped, so removed and added lines appear interleaved.
# The following names are assigned twice below: temperature, top_p, top_k,
# frequency_penalty, max_tokens, stop, presence_penalty. Confirm against the
# merged file which declaration of each one survives before relying on any
# default value shown here.
#
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/chat/create
messages: Union[
str,
List[Dict[str, str]],
List[Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]],
]
model: Optional[str] = "Intel/neural-chat-7b-v3-3"
temperature: Optional[float] = 0.01
top_p: Optional[float] = 0.95
top_k: Optional[int] = 10
frequency_penalty: Optional[float] = 0.0
logit_bias: Optional[Dict[str, float]] = None
logprobs: Optional[bool] = False
top_logprobs: Optional[int] = 0
max_tokens: Optional[int] = 16 # use https://platform.openai.com/docs/api-reference/completions/create
n: Optional[int] = 1
max_tokens: Optional[int] = 1024
stop: Optional[Union[str, List[str]]] = None
presence_penalty: Optional[float] = 0.0
response_format: Optional[ResponseFormat] = None
seed: Optional[int] = None
service_tier: Optional[str] = None
stop: Union[str, List[str], None] = Field(default_factory=list)
stream: Optional[bool] = False
presence_penalty: Optional[float] = 1.03
frequency_penalty: Optional[float] = 0.0
stream_options: Optional[StreamOptions] = None
temperature: Optional[float] = 1.0 # vllm default 0.7
top_p: Optional[float] = None # openai default 1.0, but tgi needs `top_p` must be > 0.0 and < 1.0, set None
tools: Optional[List[ChatCompletionToolsParam]] = None
# Accepts only the literal "none" or a named-function choice; defaults to "none".
tool_choice: Optional[Union[Literal["none"], ChatCompletionNamedToolChoiceParam]] = "none"
parallel_tool_calls: Optional[bool] = True
user: Optional[str] = None

# Ordered by official OpenAI API documentation
# default values are same with
# https://platform.openai.com/docs/api-reference/completions/create
best_of: Optional[int] = 1
suffix: Optional[str] = None

# vllm reference: https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/protocol.py#L130
repetition_penalty: Optional[float] = 1.0

# tgi reference: https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference/generate
# some tgi parameters in use
# default values are same with
# https://github.com/huggingface/text-generation-inference/blob/main/router/src/lib.rs#L190
# max_new_tokens: Optional[int] = 100 # Priority use openai
top_k: Optional[int] = None
# top_p: Optional[float] = None # Priority use openai
typical_p: Optional[float] = None
# repetition_penalty: Optional[float] = None

# doc: begin-chat-completion-extra-params
echo: Optional[bool] = Field(
default=False,
description=(
"If true, the new message will be prepended with the last message " "if they belong to the same role."
),
)
add_generation_prompt: Optional[bool] = Field(
default=True,
description=(
"If true, the generation prompt will be added to the chat template. "
"This is a parameter used by chat template in tokenizer config of the "
"model."
),
)
add_special_tokens: Optional[bool] = Field(
default=False,
description=(
"If true, special tokens (e.g. BOS) will be added to the prompt "
"on top of what is added by the chat template. "
"For most models, the chat template takes care of adding the "
"special tokens so this should be set to False (as is the "
"default)."
),
)
documents: Optional[Union[List[Dict[str, str]], List[str]]] = Field(
default=None,
description=(
"A list of dicts representing documents that will be accessible to "
"the model if it is performing RAG (retrieval-augmented generation)."
" If the template does not support RAG, this argument will have no "
"effect. We recommend that each document should be a dict containing "
'"title" and "text" keys.'
),
)
chat_template: Optional[str] = Field(
default=None,
description=(
"A template to use for this conversion. "
"If this is not passed, the model's default chat template will be "
"used instead. We recommend that the template contains {context} and {question} for rag,"
"or only contains {question} for chat completion without rag."
),
)
chat_template_kwargs: Optional[Dict[str, Any]] = Field(
default=None,
description=("Additional kwargs to pass to the template renderer. " "Will be accessible by the chat template."),
)
# doc: end-chat-completion-extra-params

# embedding
# NOTE(review): `input` defaults to None although its annotation does not
# include None — confirm whether Optional[...] was intended.
input: Union[List[int], List[List[int]], str, List[str]] = None # user query/question from messages[-]
# Defaults to "float"; the pattern only admits "float" or "base64".
encoding_format: Optional[str] = Field("float", pattern="^(float|base64)$")
dimensions: Optional[int] = None
embedding: Union[EmbeddingResponse, List[float]] = Field(default_factory=list)

# retrieval
search_type: str = "similarity"
k: int = 4
distance_threshold: Optional[float] = None
fetch_k: int = 20
lambda_mult: float = 0.5
score_threshold: float = 0.2
retrieved_docs: Union[List[RetrievalResponseData], List[Dict[str, Any]]] = Field(default_factory=list)

# reranking
top_n: int = 1
reranked_docs: Union[List[RerankingResponseData], List[Dict[str, Any]]] = Field(default_factory=list)

# Constant tag identifying this payload as a chat request.
request_type: Literal["chat"] = "chat"


class AudioChatCompletionRequest(BaseModel):
audio: str
Expand Down Expand Up @@ -110,41 +329,6 @@ class ChatCompletionStreamResponse(BaseModel):
choices: List[ChatCompletionResponseStreamChoice]


class TokenCheckRequestItem(BaseModel):
    # One entry of TokenCheckRequest.prompts; all three fields are required.
    model: str
    prompt: str
    max_tokens: int


class TokenCheckRequest(BaseModel):
    # Request wrapper holding a list of TokenCheckRequestItem entries.
    prompts: List[TokenCheckRequestItem]


class TokenCheckResponseItem(BaseModel):
    # One entry of TokenCheckResponse.prompts; all three fields are required.
    # NOTE(review): `fits` presumably reports whether `tokenCount` stays within
    # `contextLength` — confirm against the code that builds this response.
    fits: bool
    tokenCount: int
    contextLength: int


class TokenCheckResponse(BaseModel):
    # Response wrapper holding a list of TokenCheckResponseItem entries.
    prompts: List[TokenCheckResponseItem]


class EmbeddingsRequest(BaseModel):
    # Embeddings request body: only `input` is required, every other field
    # defaults to None.
    model: Optional[str] = None
    engine: Optional[str] = None
    input: Union[str, List[Any]]
    user: Optional[str] = None
    encoding_format: Optional[str] = None


class EmbeddingsResponse(BaseModel):
    # Embeddings response envelope: `object` defaults to "list"; `data`,
    # `model` and `usage` must all be supplied.
    object: str = "list"
    data: List[Dict[str, Any]]
    model: str
    usage: UsageInfo


class CompletionRequest(BaseModel):
model: str
prompt: Union[str, List[Any]]
Expand Down
38 changes: 36 additions & 2 deletions comps/cores/proto/docarray.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from typing import Optional
from typing import Dict, List, Optional, Union

import numpy as np
from docarray import BaseDoc, DocList
from docarray.documents import AudioDoc
from docarray.typing import AudioUrl
from pydantic import Field, conint, conlist
from pydantic import Field, conint, conlist, field_validator


class TopologyInfo:
Expand Down Expand Up @@ -88,6 +88,30 @@
repetition_penalty: float = 1.03
streaming: bool = True

chat_template: Optional[str] = Field(
default=None,
description=(
"A template to use for this conversion. "
"If this is not passed, the model's default chat template will be "
"used instead. We recommend that the template contains {context} and {question} for rag,"
"or only contains {question} for chat completion without rag."
),
)
documents: Optional[Union[List[Dict[str, str]], List[str]]] = Field(
default=[],
description=(
"A list of dicts representing documents that will be accessible to "
"the model if it is performing RAG (retrieval-augmented generation)."
" If the template does not support RAG, this argument will have no "
"effect. We recommend that each document should be a dict containing "
'"title" and "text" keys.'
),
)

@field_validator("chat_template")
@classmethod
def chat_template_must_contain_variables(cls, v):
    """Validate the ``chat_template`` field.

    Currently a pass-through: the value is returned unchanged and, despite
    the validator's name, no check for template placeholders is performed.

    Args:
        v: The incoming ``chat_template`` value.

    Returns:
        ``v`` unchanged.
    """
    # @classmethod is placed under @field_validator as pydantic v2 recommends;
    # it does not change how the validator is invoked.
    return v

Check warning on line 113 in comps/cores/proto/docarray.py

View check run for this annotation

Codecov / codecov/patch

comps/cores/proto/docarray.py#L113

Added line #L113 was not covered by tests


class LLMParams(BaseDoc):
max_new_tokens: int = 1024
Expand All @@ -98,6 +122,16 @@
repetition_penalty: float = 1.03
streaming: bool = True

chat_template: Optional[str] = Field(
default=None,
description=(
"A template to use for this conversion. "
"If this is not passed, the model's default chat template will be "
"used instead. We recommend that the template contains {context} and {question} for rag,"
"or only contains {question} for chat completion without rag."
),
)


class RAGASParams(BaseDoc):
questions: DocList[TextDoc]
Expand Down
Loading
Loading