diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index 0a880c4a42..88c5466fa4 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -27,8 +27,6 @@ using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>; * @param m_batch_sizes Batch sizes for each generate call. * @param m_durations Total durations for each generate call in microseconds. * @param m_inference_durations Total inference duration for each generate call in microseconds. - * @param num_generated_tokens Total number of tokens generated. - * @param num_input_tokens Total number of tokens in the input prompt. */ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { std::vector<MicroSeconds> generate_durations; diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 8fab02bc47..68d4a44fde 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -992,32 +992,32 @@ class RawPerfMetrics: Structure with raw performance metrics for each generation before any statistics are calculated. - :param generate_durations: Durations for each generate call in microseconds. - :type generate_durations: List[MicroSeconds] + :param generate_durations: Durations for each generate call in milliseconds. + :type generate_durations: List[float] - :param tokenization_durations: Durations for the tokenization process in microseconds. - :type tokenization_durations: List[MicroSeconds] + :param tokenization_durations: Durations for the tokenization process in milliseconds. + :type tokenization_durations: List[float] - :param detokenization_durations: Durations for the detokenization process in microseconds. - :type detokenization_durations: List[MicroSeconds] + :param detokenization_durations: Durations for the detokenization process in milliseconds. 
+ :type detokenization_durations: List[float] - :param m_times_to_first_token: Times to the first token for each call in microseconds. - :type m_times_to_first_token: List[MicroSeconds] + :param m_times_to_first_token: Times to the first token for each call in milliseconds. + :type m_times_to_first_token: List[float] :param m_new_token_times: Timestamps of generation every token or batch of tokens in milliseconds. - :type m_new_token_times: List[MilliSeconds] + :type m_new_token_times: List[float] + + :param token_infer_durations: Inference time for each token in milliseconds. + :type token_infer_durations: List[float] :param m_batch_sizes: Batch sizes for each generate call. :type m_batch_sizes: List[int] - :param m_durations: Total durations for each generate call in microseconds. - :type m_durations: List[MicroSeconds] - - :param num_generated_tokens: Total number of tokens generated. - :type num_generated_tokens: int + :param m_durations: Total durations for each generate call in milliseconds. + :type m_durations: List[float] - :param num_input_tokens: Total number of tokens in the input prompt. - :type num_input_tokens: int + :param inference_durations: Total inference duration for each generate call in milliseconds. + :type inference_durations: List[float] """ def __init__(self) -> None: ... @@ -1028,6 +1028,9 @@ class RawPerfMetrics: def generate_durations(self) -> list[float]: ... @property + def inference_durations(self) -> list[float]: + ... + @property def m_batch_sizes(self) -> list[int]: ... @property @@ -1040,6 +1043,9 @@ class RawPerfMetrics: def m_times_to_first_token(self) -> list[float]: ... @property + def token_infer_durations(self) -> list[float]: + ... + @property def tokenization_durations(self) -> list[float]: ... 
class Scheduler: diff --git a/src/python/py_perf_metrics.cpp b/src/python/py_perf_metrics.cpp index 1d37784e27..8b33c92a05 100644 --- a/src/python/py_perf_metrics.cpp +++ b/src/python/py_perf_metrics.cpp @@ -20,32 +20,32 @@ namespace { auto raw_perf_metrics_docstring = R"( Structure with raw performance metrics for each generation before any statistics are calculated. - :param generate_durations: Durations for each generate call in microseconds. - :type generate_durations: List[MicroSeconds] + :param generate_durations: Durations for each generate call in milliseconds. + :type generate_durations: List[float] - :param tokenization_durations: Durations for the tokenization process in microseconds. - :type tokenization_durations: List[MicroSeconds] + :param tokenization_durations: Durations for the tokenization process in milliseconds. + :type tokenization_durations: List[float] - :param detokenization_durations: Durations for the detokenization process in microseconds. - :type detokenization_durations: List[MicroSeconds] + :param detokenization_durations: Durations for the detokenization process in milliseconds. + :type detokenization_durations: List[float] - :param m_times_to_first_token: Times to the first token for each call in microseconds. - :type m_times_to_first_token: List[MicroSeconds] + :param m_times_to_first_token: Times to the first token for each call in milliseconds. + :type m_times_to_first_token: List[float] :param m_new_token_times: Timestamps of generation every token or batch of tokens in milliseconds. - :type m_new_token_times: List[MilliSeconds] + :type m_new_token_times: List[float] + + :param token_infer_durations: Inference time for each token in milliseconds. + :type token_infer_durations: List[float] :param m_batch_sizes: Batch sizes for each generate call. :type m_batch_sizes: List[int] - :param m_durations: Total durations for each generate call in microseconds. 
- :type m_durations: List[MicroSeconds] - - :param num_generated_tokens: Total number of tokens generated. - :type num_generated_tokens: int + :param m_durations: Total durations for each generate call in milliseconds. + :type m_durations: List[float] - :param num_input_tokens: Total number of tokens in the input prompt. - :type num_input_tokens: int + :param inference_durations: Total inference duration for each generate call in milliseconds. + :type inference_durations: List[float] )"; auto perf_metrics_docstring = R"( @@ -145,10 +145,16 @@ void init_perf_metrics(py::module_& m) { .def_property_readonly("m_new_token_times", [](const RawPerfMetrics &rw) { return timestamp_to_ms(rw, &RawPerfMetrics::m_new_token_times); }) + .def_property_readonly("token_infer_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_token_infer_durations); + }) + .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes) .def_property_readonly("m_durations", [](const RawPerfMetrics &rw) { return get_ms(rw, &RawPerfMetrics::m_durations); }) - .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes); + .def_property_readonly("inference_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_inference_durations); + }); py::class_<MeanStdPair>(m, "MeanStdPair") .def(py::init<>())