feature: triton generate support #675

Draft · wants to merge 19 commits into base: main

Changes from 15 commits
16 changes: 13 additions & 3 deletions src/c++/perf_analyzer/client_backend/openai/openai_client.cc
@@ -63,6 +63,14 @@ namespace openai {
void
ChatCompletionRequest::SendResponse(bool is_final, bool is_null)
{
// if final response has already been sent
Collaborator:
The classes in this file should be renamed since they aren't specific to Chat Completions.

Contributor Author:
I think "HTTP with SSE Support" is in the end what it is .... not sure the best name.

Collaborator:
I'd really like to see the classes refactored. We shouldn't need two independent full http clients. Either one goes away, or we get a base class and then some really thin implementation classes on top. We already have stories for this (TMA-1644), so no big deal if this is ignored in this PR.

// due to detecting the [DONE]
// ignore final response due to request completion
if (final_response_sent_) {
return;
}

final_response_sent_ = is_final;
response_callback_(new ChatCompletionResult(
http_code_, std::move(response_buffer_), is_final, is_null, request_id_));
}
@@ -172,9 +180,11 @@ ChatCompletionClient::AsyncInfer(
request->timer_.CaptureTimestamp(
triton::client::RequestTimers::Kind::REQUEST_END);
UpdateInferStat(request->timer_);
if (!request->is_stream_) {
request->SendResponse(true /* is_final */, false /* is_null */);
}

// Send Response checks if a final
// response has already been sent
// (in the case of seeing [DONE] in streaming case)
request->SendResponse(true /* is_final */, false /* is_null */);
};
std::unique_ptr<HttpRequest> request(new ChatCompletionRequest(
std::move(completion_callback), std::move(callback), request_id,
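The guard above exists because a streaming request can reach a "final" response twice: once when the SSE stream delivers the [DONE] sentinel, and again when the HTTP transfer itself completes. A minimal Python sketch of that flow, illustrative only and using hypothetical names (handle_sse_line, on_request_complete); the PR's actual logic is the C++ above:

# Illustrative sketch only (not part of the PR): models why SendResponse()
# must ignore a second "final" call after [DONE] has already been seen.
class StreamingRequest:
    def __init__(self, deliver):
        self.deliver = deliver            # hands a response back to the caller
        self.final_response_sent = False  # mirrors final_response_sent_

    def send_response(self, is_final, is_null):
        if self.final_response_sent:      # [DONE] already produced the final response
            return
        self.final_response_sent = is_final
        self.deliver(is_final, is_null)

    def handle_sse_line(self, line):      # hypothetical streaming handler
        if line.strip() == "data: [DONE]":
            self.send_response(True, False)

    def on_request_complete(self):        # hypothetical completion callback
        # Runs once the HTTP request finishes; the guard above suppresses
        # a duplicate final response in the streaming case.
        self.send_response(True, False)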
@@ -127,6 +127,7 @@ class ChatCompletionRequest : public HttpRequest {
// The timers for infer request.
triton::client::RequestTimers timer_;
const std::string request_id_;
bool final_response_sent_{false};
};

class ChatCompletionClient : public HttpClient {
@@ -41,6 +41,7 @@ class PromptSource(Enum):
class OutputFormat(Enum):
OPENAI_CHAT_COMPLETIONS = auto()
OPENAI_COMPLETIONS = auto()
TRITON_GENERATE = auto()
TENSORRTLLM = auto()
VLLM = auto()

@@ -231,7 +232,6 @@ def _get_input_dataset_from_url(
url = cls._resolve_url(dataset_name)
configured_url = cls._create_configured_url(url, starting_index, length)
dataset = cls._download_dataset(configured_url)

return dataset

@classmethod
@@ -364,7 +364,18 @@ def _convert_generic_json_to_output_format(
model_name: list = [],
model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN,
) -> Dict:
if output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS:
if output_format == OutputFormat.TRITON_GENERATE:
output_json = cls._convert_generic_json_to_generate_format(
generic_dataset,
add_model_name,
add_stream,
extra_inputs,
output_tokens_mean,
output_tokens_stddev,
output_tokens_deterministic,
model_name,
)
elif output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS:
output_json = cls._convert_generic_json_to_openai_chat_completions_format(
generic_dataset,
add_model_name,
@@ -454,6 +465,41 @@ def _convert_generic_json_to_openai_chat_completions_format(

return pa_json

@classmethod
def _convert_generic_json_to_generate_format(
cls,
dataset_json: Dict,
add_model_name: bool,
add_stream: bool,
extra_inputs: Dict,
output_tokens_mean: int,
output_tokens_stddev: int,
output_tokens_deterministic: bool,
model_name: list = [],
model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN,
) -> Dict:
(
system_role_headers,
user_role_headers,
text_input_headers,
) = cls._determine_json_feature_roles(dataset_json)

pa_json = cls._populate_triton_generate_output_json(
dataset_json,
system_role_headers,
user_role_headers,
text_input_headers,
add_model_name,
add_stream,
extra_inputs,
output_tokens_mean,
output_tokens_stddev,
output_tokens_deterministic,
model_name,
)

return pa_json

@classmethod
def _convert_generic_json_to_openai_completions_format(
cls,
@@ -653,6 +699,52 @@ def _populate_openai_chat_completions_output_json(

return pa_json

@classmethod
def _populate_triton_generate_output_json(
cls,
dataset: Dict,
system_role_headers: List[str],
user_role_headers: List[str],
text_input_headers: List[str],
add_model_name: bool,
add_stream: bool,
extra_inputs: Dict,
output_tokens_mean: int,
output_tokens_stddev: int,
output_tokens_deterministic: bool,
model_name: list = [],
model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN,
) -> Dict:
pa_json: dict = {"data": [{"payload": [{}]} for _ in dataset["rows"]]}

for index, entry in enumerate(dataset["rows"]):
for header, content in entry.items():
new_text_input = cls._create_new_text_input(
header,
system_role_headers,
user_role_headers,
text_input_headers,
content,
)
pa_json["data"][index]["payload"][0]["text_input"] = new_text_input

iter_model_name = cls._select_model_name(
model_name, index, model_selection_strategy
)
pa_json = cls._add_optional_tags_to_openai_json(
pa_json,
index,
False,
add_stream,
extra_inputs,
output_tokens_mean,
output_tokens_stddev,
output_tokens_deterministic,
iter_model_name,
)

return pa_json
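
As a rough illustration (not taken from the PR or its tests) of the payload shape this helper emits: each dataset row contributes one payload whose prompt lands under "text_input", and _add_optional_tags_to_openai_json then appends the optional fields (for example a "stream" flag when streaming is requested, plus any --extra-inputs values). Assuming a single-row dataset:

# Hypothetical input for the helper above ...
dataset = {"rows": [{"text_input": "What is Triton Inference Server?"}]}

# ... and the approximate structure of the resulting PA input JSON
# (optional keys depend on the flags passed in):
pa_json = {
    "data": [
        {
            "payload": [
                {
                    "text_input": "What is Triton Inference Server?",
                    "stream": True,  # present only if add_stream was set
                }
            ]
        }
    ]
}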

@classmethod
def _populate_openai_completions_output_json(
cls,
24 changes: 22 additions & 2 deletions src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
@@ -46,6 +46,7 @@ class ResponseFormat(Enum):
OPENAI_CHAT_COMPLETIONS = auto()
OPENAI_COMPLETIONS = auto()
TRITON = auto()
TRITON_GENERATE = auto()


class Metrics:
@@ -446,6 +447,8 @@ def _get_profile_metadata(self, data: dict) -> None:
self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS
elif data["endpoint"] == "v1/completions":
self._response_format = ResponseFormat.OPENAI_COMPLETIONS
elif "generate" in data["endpoint"]:
self._response_format = ResponseFormat.TRITON_GENERATE
else:
# TPA-66: add PA metadata to handle this case
# When endpoint field is either empty or custom endpoint, fall
@@ -662,6 +665,8 @@ def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]:
input_text = payload["messages"][0]["content"]
elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS:
input_text = payload["prompt"]
elif self._response_format == ResponseFormat.TRITON_GENERATE:
input_text = payload["text_input"]
else:
raise ValueError(
"Failed to parse OpenAI request input in profile export file."
@@ -689,7 +694,10 @@ def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]]
"""Tokenize the OpenAI response output texts."""
output_texts = []
for output in res_outputs:
text = self._extract_openai_text_output(output["response"])
if self._response_format == ResponseFormat.TRITON_GENERATE:
text = self._extract_generate_text_output(output["response"])
else:
text = self._extract_openai_text_output(output["response"])
output_texts.append(text)
return self._run_tokenizer(output_texts)

@@ -702,6 +710,15 @@ def _run_tokenizer(self, output_texts: List[str]) -> List[List[int]]:
encodings = self._tokenizer(output_texts)
return [out[1:] for out in encodings.data["input_ids"]]

def _extract_generate_text_output(self, response: str) -> str:
response = remove_sse_prefix(response)

if response == "":
return response

data = json.loads(response)
return data["text_output"]
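
A quick usage sketch for the helper above, assuming remove_sse_prefix simply strips a leading "data:" marker from an SSE chunk (its exact behavior lives elsewhere in the codebase):

# Illustrative stand-alone version of the same parsing steps.
import json

def extract_generate_text_output(response: str) -> str:
    if response.startswith("data:"):            # stand-in for remove_sse_prefix
        response = response[len("data:"):].strip()
    if response == "":
        return response                          # empty keep-alive chunk
    return json.loads(response)["text_output"]

print(extract_generate_text_output('data: {"text_output": "Deep learning is"}'))
# -> Deep learning is
print(extract_generate_text_output(""))          # -> "" (nothing to tokenize)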

def _extract_openai_text_output(self, response: str) -> str:
"""Extracts text/content of the OpenAI response object."""
response = remove_sse_prefix(response)
Expand Down Expand Up @@ -731,7 +748,10 @@ def _extract_openai_text_output(self, response: str) -> str:

def _is_openai_empty_response(self, response: str) -> bool:
Contributor:
We should change the name of the function since it's no longer just openai

"""Returns true if the response is an openai response with no content (or empty content)"""
text = self._extract_openai_text_output(response)
if self._response_format == ResponseFormat.TRITON_GENERATE:
text = self._extract_generate_text_output(response)
else:
text = self._extract_openai_text_output(response)
if text:
return False
return True
76 changes: 35 additions & 41 deletions src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
@@ -51,7 +51,12 @@

logger = logging.getLogger(__name__)

_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions"}
_endpoint_type_map = {
"chat": "v1/chat/completions",
"completions": "v1/completions",
"generate": "v2/models/{MODEL_NAME}/generate",
"kserve": "v2/models/{MODEL_NAME}/infer",
}
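
The {MODEL_NAME} placeholder in the two new entries is filled later in _check_conditional_args from the first --model value (see the updated block below). For a hypothetical model named "gpt2" the resolved paths would be:

# Hypothetical resolution of the templated endpoint paths for --model gpt2:
_endpoint_type_map["generate"].format(MODEL_NAME="gpt2")  # -> "v2/models/gpt2/generate"
_endpoint_type_map["kserve"].format(MODEL_NAME="gpt2")    # -> "v2/models/gpt2/infer"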


def _check_model_args(
@@ -98,31 +103,31 @@ def _check_conditional_args(
Check for conditional args and raise an error if they are not set.
"""

# Endpoint and output format checks
if args.service_kind == "openai":
if args.endpoint_type is None:
parser.error(
"The --endpoint-type option is required when using the 'openai' service-kind."
)
else:
if args.endpoint_type == "chat":
args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS
elif args.endpoint_type == "completions":
args.output_format = OutputFormat.OPENAI_COMPLETIONS

if args.endpoint is not None:
args.endpoint = args.endpoint.lstrip(" /")
else:
args.endpoint = _endpoint_type_map[args.endpoint_type]
elif args.endpoint_type is not None:
parser.error(
"The --endpoint-type option should only be used when using the 'openai' service-kind."
)

if args.service_kind == "triton":
if args.endpoint_type == "chat":
args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS
args.service_kind = "openai"
elif args.endpoint_type == "completions":
args.output_format = OutputFormat.OPENAI_COMPLETIONS
args.service_kind = "openai"
elif args.endpoint_type == "generate":
args.output_format = OutputFormat.TRITON_GENERATE
args.service_kind = "openai"
elif args.endpoint_type == "kserve":
args.service_kind = "triton"
args = _convert_str_to_enum_entry(args, "backend", OutputFormat)
args.output_format = args.backend

if args.endpoint is not None:
args.endpoint = args.endpoint.lstrip(" /")
else:
if args.model:
model_name = args.model[0]
else:
model_name = ""
args.endpoint = _endpoint_type_map[args.endpoint_type].format(
MODEL_NAME=model_name
)

# Output token distribution checks
if args.output_tokens_mean == LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN:
if args.output_tokens_stddev != LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV:
@@ -137,7 +142,7 @@
if args.service_kind != "triton":
if args.output_tokens_mean_deterministic:
parser.error(
"The --output-tokens-mean-deterministic option is only supported with the Triton service-kind."
"The --output-tokens-mean-deterministic option is only supported with the kserve endpoint type."
Collaborator:
Should the code be changed to check endpoint_type != kserve? I know that with the current code it is the same result, but it introduces an assumption (endpoint kserve -> service_kind triton) that could trip up a future developer.

)

return args
@@ -272,7 +277,7 @@ def _add_input_args(parser):
help=f"When using --output-tokens-mean, this flag can be set to "
"improve precision by setting the minimum number of tokens "
"equal to the requested number of tokens. This is currently "
"supported with the Triton service-kind. "
"supported with the kserve endpoint type. "
"Note that there is still some variability in the requested number "
"of output tokens, but GenAi-Perf attempts its best effort with your "
"model to get the right number of output tokens. ",
@@ -380,10 +385,10 @@ def _add_endpoint_args(parser):
endpoint_group.add_argument(
"--backend",
type=str,
choices=utils.get_enum_names(OutputFormat)[2:],
choices=["tensorrtllm", "vllm"],
default="tensorrtllm",
required=False,
help=f'When using the "triton" service-kind, '
help=f'When using the "kserve" endpoint type, '
Collaborator:
Can generate endpoint not use trtllm vs vllm?

Contributor Author (@nnshah1, Jun 5, 2024):

It can - I haven't added any different behavior for the different backends. Actually - it has only been tested against vllm at the moment. So this is fair point ...

Let me move this back to draft - plan to test trt-llm in the next week or so

"this is the backend of the model. "
"For the TENSORRT-LLM backend, you currently must set "
"'exclude_input_in_output' to true in the model config to "
@@ -400,21 +405,10 @@
endpoint_group.add_argument(
"--endpoint-type",
type=str,
choices=["chat", "completions"],
required=False,
help=f"The endpoint-type to send requests to on the "
'server. This is only used with the "openai" service-kind.',
)

endpoint_group.add_argument(
"--service-kind",
Contributor:
Finally one less CLI option 😄 Can we also update the README to reflect the changes in CLI options?

type=str,
choices=["triton", "openai"],
default="triton",
choices=["chat", "completions", "generate", "kserve"],
default="kserve",
required=False,
help="The kind of service perf_analyzer will "
'generate load for. In order to use "openai", '
"you must specify an api via --endpoint-type.",
help=f"The endpoint-type for requests. Inputs will be formatted according to endpoint-type.",
)
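
Putting the new choices together, a typical run against Triton's generate endpoint would look roughly like genai-perf -m gpt2 --endpoint-type generate --streaming (assuming the existing -m/--model and --streaming flags; "gpt2" is a placeholder model name). With the mapping above, the parser would then resolve service_kind to "openai" and the endpoint to v2/models/gpt2/generate.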

endpoint_group.add_argument(