feature: triton generate support #675

Draft · wants to merge 19 commits into base: main

Changes from 15 commits
16 changes: 13 additions & 3 deletions src/c++/perf_analyzer/client_backend/openai/openai_client.cc
@@ -63,6 +63,14 @@ namespace openai {
void
ChatCompletionRequest::SendResponse(bool is_final, bool is_null)
{
// if final response has already been sent
Collaborator:
The classes in this file should be renamed since they aren't specific to Chat Completions.

Contributor Author:
I think "HTTP with SSE Support" is in the end what it is .... not sure the best name.

Collaborator:
I'd really like to see the classes refactored. We shouldn't need two independent full http clients. Either one goes away, or we get a base class and then some really thin implementation classes on top. We already have stories for this (TMA-1644), so no big deal if this is ignored in this PR.

// due to detecting the [DONE]
// ignore final response due to request completion
if (final_response_sent_) {
return;
}

final_response_sent_ = is_final;
response_callback_(new ChatCompletionResult(
http_code_, std::move(response_buffer_), is_final, is_null, request_id_));
}
@@ -172,9 +180,11 @@ ChatCompletionClient::AsyncInfer(
request->timer_.CaptureTimestamp(
triton::client::RequestTimers::Kind::REQUEST_END);
UpdateInferStat(request->timer_);
if (!request->is_stream_) {
request->SendResponse(true /* is_final */, false /* is_null */);
}

// Send Response checks if a final
// response has already been sent
// (in the case of seeing [DONE] in streaming case)
request->SendResponse(true /* is_final */, false /* is_null */);
};
std::unique_ptr<HttpRequest> request(new ChatCompletionRequest(
std::move(completion_callback), std::move(callback), request_id,
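The guard above exists because a streaming request can reach a "final" response twice: once when the SSE stream delivers the [DONE] sentinel, and again when the HTTP transfer itself completes. A minimal Python sketch of that flow, illustrative only and using hypothetical names (handle_sse_line, on_request_complete); the PR's actual logic is the C++ above:

# Illustrative sketch only (not part of the PR): models why SendResponse()
# must ignore a second "final" call after [DONE] has already been seen.
class StreamingRequest:
    def __init__(self, deliver):
        self.deliver = deliver            # hands a response back to the caller
        self.final_response_sent = False  # mirrors final_response_sent_

    def send_response(self, is_final, is_null):
        if self.final_response_sent:      # [DONE] already produced the final response
            return
        self.final_response_sent = is_final
        self.deliver(is_final, is_null)

    def handle_sse_line(self, line):      # hypothetical streaming handler
        if line.strip() == "data: [DONE]":
            self.send_response(True, False)

    def on_request_complete(self):        # hypothetical completion callback
        # Runs once the HTTP request finishes; the guard above suppresses
        # a duplicate final response in the streaming case.
        self.send_response(True, False)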
@@ -127,6 +127,7 @@ class ChatCompletionRequest : public HttpRequest {
// The timers for infer request.
triton::client::RequestTimers timer_;
const std::string request_id_;
bool final_response_sent_{false};
};

class ChatCompletionClient : public HttpClient {
@@ -41,6 +41,7 @@ class PromptSource(Enum):
class OutputFormat(Enum):
OPENAI_CHAT_COMPLETIONS = auto()
OPENAI_COMPLETIONS = auto()
TRITON_GENERATE = auto()
TENSORRTLLM = auto()
VLLM = auto()

@@ -231,7 +232,6 @@ def _get_input_dataset_from_url(
url = cls._resolve_url(dataset_name)
configured_url = cls._create_configured_url(url, starting_index, length)
dataset = cls._download_dataset(configured_url)

return dataset

@classmethod
@@ -364,7 +364,18 @@ def _convert_generic_json_to_output_format(
model_name: list = [],
model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN,
) -> Dict:
if output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS:
if output_format == OutputFormat.TRITON_GENERATE:
output_json = cls._convert_generic_json_to_generate_format(
generic_dataset,
add_model_name,
add_stream,
extra_inputs,
output_tokens_mean,
output_tokens_stddev,
output_tokens_deterministic,
model_name,
)
elif output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS:
output_json = cls._convert_generic_json_to_openai_chat_completions_format(
generic_dataset,
add_model_name,
@@ -454,6 +465,41 @@ def _convert_generic_json_to_openai_chat_completions_format(

return pa_json

@classmethod
def _convert_generic_json_to_generate_format(
cls,
dataset_json: Dict,
add_model_name: bool,
add_stream: bool,
extra_inputs: Dict,
output_tokens_mean: int,
output_tokens_stddev: int,
output_tokens_deterministic: bool,
model_name: list = [],
model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN,
) -> Dict:
(
system_role_headers,
user_role_headers,
text_input_headers,
) = cls._determine_json_feature_roles(dataset_json)

pa_json = cls._populate_triton_generate_output_json(
dataset_json,
system_role_headers,
user_role_headers,
text_input_headers,
add_model_name,
add_stream,
extra_inputs,
output_tokens_mean,
output_tokens_stddev,
output_tokens_deterministic,
model_name,
)

return pa_json

@classmethod
def _convert_generic_json_to_openai_completions_format(
cls,
@@ -653,6 +699,52 @@ def _populate_openai_chat_completions_output_json(

return pa_json

@classmethod
def _populate_triton_generate_output_json(
cls,
dataset: Dict,
system_role_headers: List[str],
user_role_headers: List[str],
text_input_headers: List[str],
add_model_name: bool,
add_stream: bool,
extra_inputs: Dict,
output_tokens_mean: int,
output_tokens_stddev: int,
output_tokens_deterministic: bool,
model_name: list = [],
model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN,
) -> Dict:
pa_json: dict = {"data": [{"payload": [{}]} for _ in dataset["rows"]]}

for index, entry in enumerate(dataset["rows"]):
for header, content in entry.items():
new_text_input = cls._create_new_text_input(
header,
system_role_headers,
user_role_headers,
text_input_headers,
content,
)
pa_json["data"][index]["payload"][0]["text_input"] = new_text_input

iter_model_name = cls._select_model_name(
model_name, index, model_selection_strategy
)
pa_json = cls._add_optional_tags_to_openai_json(
pa_json,
index,
False,
add_stream,
extra_inputs,
output_tokens_mean,
output_tokens_stddev,
output_tokens_deterministic,
iter_model_name,
)

return pa_json
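
As a rough illustration (not taken from the PR or its tests) of the payload shape this helper emits: each dataset row contributes one payload whose prompt lands under "text_input", and _add_optional_tags_to_openai_json then appends the optional fields (for example a "stream" flag when streaming is requested, plus any --extra-inputs values). Assuming a single-row dataset:

# Hypothetical input for the helper above ...
dataset = {"rows": [{"text_input": "What is Triton Inference Server?"}]}

# ... and the approximate structure of the resulting PA input JSON
# (optional keys depend on the flags passed in):
pa_json = {
    "data": [
        {
            "payload": [
                {
                    "text_input": "What is Triton Inference Server?",
                    "stream": True,  # present only if add_stream was set
                }
            ]
        }
    ]
}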

@classmethod
def _populate_openai_completions_output_json(
cls,
24 changes: 22 additions & 2 deletions src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
@@ -46,6 +46,7 @@ class ResponseFormat(Enum):
OPENAI_CHAT_COMPLETIONS = auto()
OPENAI_COMPLETIONS = auto()
TRITON = auto()
TRITON_GENERATE = auto()


class Metrics:
@@ -446,6 +447,8 @@ def _get_profile_metadata(self, data: dict) -> None:
self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS
elif data["endpoint"] == "v1/completions":
self._response_format = ResponseFormat.OPENAI_COMPLETIONS
elif "generate" in data["endpoint"]:
self._response_format = ResponseFormat.TRITON_GENERATE
else:
# TPA-66: add PA metadata to handle this case
# When endpoint field is either empty or custom endpoint, fall
@@ -662,6 +665,8 @@ def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]:
input_text = payload["messages"][0]["content"]
elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS:
input_text = payload["prompt"]
elif self._response_format == ResponseFormat.TRITON_GENERATE:
input_text = payload["text_input"]
else:
raise ValueError(
"Failed to parse OpenAI request input in profile export file."
@@ -689,7 +694,10 @@ def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]]
"""Tokenize the OpenAI response output texts."""
output_texts = []
for output in res_outputs:
text = self._extract_openai_text_output(output["response"])
if self._response_format == ResponseFormat.TRITON_GENERATE:
text = self._extract_generate_text_output(output["response"])
else:
text = self._extract_openai_text_output(output["response"])
output_texts.append(text)
return self._run_tokenizer(output_texts)

@@ -702,6 +710,15 @@ def _run_tokenizer(self, output_texts: List[str]) -> List[List[int]]:
encodings = self._tokenizer(output_texts)
return [out[1:] for out in encodings.data["input_ids"]]

def _extract_generate_text_output(self, response: str) -> str:
response = remove_sse_prefix(response)

if response == "":
return response

data = json.loads(response)
return data["text_output"]
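
A quick usage sketch for the helper above, assuming remove_sse_prefix simply strips a leading "data:" marker from an SSE chunk (its exact behavior lives elsewhere in the codebase):

# Illustrative stand-alone version of the same parsing steps.
import json

def extract_generate_text_output(response: str) -> str:
    if response.startswith("data:"):            # stand-in for remove_sse_prefix
        response = response[len("data:"):].strip()
    if response == "":
        return response                          # empty keep-alive chunk
    return json.loads(response)["text_output"]

print(extract_generate_text_output('data: {"text_output": "Deep learning is"}'))
# -> Deep learning is
print(extract_generate_text_output(""))          # -> "" (nothing to tokenize)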

def _extract_openai_text_output(self, response: str) -> str:
"""Extracts text/content of the OpenAI response object."""
response = remove_sse_prefix(response)
Expand Down Expand Up @@ -731,7 +748,10 @@ def _extract_openai_text_output(self, response: str) -> str:

def _is_openai_empty_response(self, response: str) -> bool:
Contributor:
We should change the name of the function since it's no longer just openai

"""Returns true if the response is an openai response with no content (or empty content)"""
text = self._extract_openai_text_output(response)
if self._response_format == ResponseFormat.TRITON_GENERATE:
text = self._extract_generate_text_output(response)
else:
text = self._extract_openai_text_output(response)
if text:
return False
return True
76 changes: 35 additions & 41 deletions src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
@@ -51,7 +51,12 @@

logger = logging.getLogger(__name__)

_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions"}
_endpoint_type_map = {
"chat": "v1/chat/completions",
"completions": "v1/completions",
"generate": "v2/models/{MODEL_NAME}/generate",
"kserve": "v2/models/{MODEL_NAME}/infer",
}
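
The {MODEL_NAME} placeholder in the two new entries is filled later in _check_conditional_args from the first --model value (see the updated block below). For a hypothetical model named "gpt2" the resolved paths would be:

# Hypothetical resolution of the templated endpoint paths for --model gpt2:
_endpoint_type_map["generate"].format(MODEL_NAME="gpt2")  # -> "v2/models/gpt2/generate"
_endpoint_type_map["kserve"].format(MODEL_NAME="gpt2")    # -> "v2/models/gpt2/infer"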


def _check_model_args(
@@ -98,31 +103,31 @@ def _check_conditional_args(
Check for conditional args and raise an error if they are not set.
"""

# Endpoint and output format checks
if args.service_kind == "openai":
if args.endpoint_type is None:
parser.error(
"The --endpoint-type option is required when using the 'openai' service-kind."
)
else:
if args.endpoint_type == "chat":
args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS
elif args.endpoint_type == "completions":
args.output_format = OutputFormat.OPENAI_COMPLETIONS

if args.endpoint is not None:
args.endpoint = args.endpoint.lstrip(" /")
else:
args.endpoint = _endpoint_type_map[args.endpoint_type]
elif args.endpoint_type is not None:
parser.error(
"The --endpoint-type option should only be used when using the 'openai' service-kind."
)

if args.service_kind == "triton":
if args.endpoint_type == "chat":
args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS
args.service_kind = "openai"
elif args.endpoint_type == "completions":
args.output_format = OutputFormat.OPENAI_COMPLETIONS
args.service_kind = "openai"
elif args.endpoint_type == "generate":
args.output_format = OutputFormat.TRITON_GENERATE
args.service_kind = "openai"
elif args.endpoint_type == "kserve":
args.service_kind = "triton"
args = _convert_str_to_enum_entry(args, "backend", OutputFormat)
args.output_format = args.backend

if args.endpoint is not None:
args.endpoint = args.endpoint.lstrip(" /")
else:
if args.model:
model_name = args.model[0]
else:
model_name = ""
args.endpoint = _endpoint_type_map[args.endpoint_type].format(
MODEL_NAME=model_name
)

# Output token distribution checks
if args.output_tokens_mean == LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN:
if args.output_tokens_stddev != LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV:
@@ -137,7 +142,7 @@
if args.service_kind != "triton":
if args.output_tokens_mean_deterministic:
parser.error(
"The --output-tokens-mean-deterministic option is only supported with the Triton service-kind."
"The --output-tokens-mean-deterministic option is only supported with the kserve endpoint type."
Collaborator:
Should the code be changed to check endpoint_type != kserve? I know that with the current code it is the same result, but it introduces an assumption (endpoint kserve -> service_kind triton) that could trip up a future developer.

)

return args
@@ -272,7 +277,7 @@ def _add_input_args(parser):
help=f"When using --output-tokens-mean, this flag can be set to "
"improve precision by setting the minimum number of tokens "
"equal to the requested number of tokens. This is currently "
"supported with the Triton service-kind. "
"supported with the kserve endpoint type. "
"Note that there is still some variability in the requested number "
"of output tokens, but GenAi-Perf attempts its best effort with your "
"model to get the right number of output tokens. ",
@@ -380,10 +385,10 @@ def _add_endpoint_args(parser):
endpoint_group.add_argument(
"--backend",
type=str,
choices=utils.get_enum_names(OutputFormat)[2:],
choices=["tensorrtllm", "vllm"],
default="tensorrtllm",
required=False,
help=f'When using the "triton" service-kind, '
help=f'When using the "kserve" endpoint type, '
Collaborator:
Can generate endpoint not use trtllm vs vllm?

Contributor Author (@nnshah1, Jun 5, 2024):

It can - I haven't added any different behavior for the different backends. Actually - it has only been tested against vllm at the moment. So this is fair point ...

Let me move this back to draft - plan to test trt-llm in the next week or so

"this is the backend of the model. "
"For the TENSORRT-LLM backend, you currently must set "
"'exclude_input_in_output' to true in the model config to "
@@ -400,21 +405,10 @@
endpoint_group.add_argument(
"--endpoint-type",
type=str,
choices=["chat", "completions"],
required=False,
help=f"The endpoint-type to send requests to on the "
'server. This is only used with the "openai" service-kind.',
)

endpoint_group.add_argument(
"--service-kind",
Contributor:
Finally one less CLI option 😄 Can we also update the README to reflect the changes in CLI options?

type=str,
choices=["triton", "openai"],
default="triton",
choices=["chat", "completions", "generate", "kserve"],
default="kserve",
required=False,
help="The kind of service perf_analyzer will "
'generate load for. In order to use "openai", '
"you must specify an api via --endpoint-type.",
help=f"The endpoint-type for requests. Inputs will be formatted according to endpoint-type.",
)
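
Putting the new choices together, a typical run against Triton's generate endpoint would look roughly like genai-perf -m gpt2 --endpoint-type generate --streaming (assuming the existing -m/--model and --streaming flags; "gpt2" is a placeholder model name). With the mapping above, the parser would then resolve service_kind to "openai" and the endpoint to v2/models/gpt2/generate.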

endpoint_group.add_argument(