Skip to content

Commit

Permalink
make token_inference_duration visible from py api (#1298)
Browse files Browse the repository at this point in the history
- @eaidova needs these metrics to be available from Python.
- Also fixed a couple of docstrings along the way.
  • Loading branch information
pavel-esir authored Dec 4, 2024
2 parents 48b2f6c + 728729c commit d681f1b
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 35 deletions.
2 changes: 0 additions & 2 deletions src/cpp/include/openvino/genai/perf_metrics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;
* @param m_batch_sizes Batch sizes for each generate call.
* @param m_durations Total durations for each generate call in microseconds.
* @param m_inference_durations Total inference duration for each generate call in microseconds.
* @param num_generated_tokens Total number of tokens generated.
* @param num_input_tokens Total number of tokens in the input prompt.
*/
struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {
std::vector<MicroSeconds> generate_durations;
Expand Down
38 changes: 22 additions & 16 deletions src/python/openvino_genai/py_openvino_genai.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -992,32 +992,32 @@ class RawPerfMetrics:
Structure with raw performance metrics for each generation before any statistics are calculated.
:param generate_durations: Durations for each generate call in microseconds.
:type generate_durations: List[MicroSeconds]
:param generate_durations: Durations for each generate call in milliseconds.
:type generate_durations: List[float]
:param tokenization_durations: Durations for the tokenization process in microseconds.
:type tokenization_durations: List[MicroSeconds]
:param tokenization_durations: Durations for the tokenization process in milliseconds.
:type tokenization_durations: List[float]
:param detokenization_durations: Durations for the detokenization process in microseconds.
:type detokenization_durations: List[MicroSeconds]
:param detokenization_durations: Durations for the detokenization process in milliseconds.
:type detokenization_durations: List[float]
:param m_times_to_first_token: Times to the first token for each call in microseconds.
:type m_times_to_first_token: List[MicroSeconds]
:param m_times_to_first_token: Times to the first token for each call in milliseconds.
:type m_times_to_first_token: List[float]
:param m_new_token_times: Timestamps of generation every token or batch of tokens in milliseconds.
:type m_new_token_times: List[MilliSeconds]
:type m_new_token_times: List[float]
:param token_infer_durations: Inference time for each token in milliseconds.
:type token_infer_durations: List[float]
:param m_batch_sizes: Batch sizes for each generate call.
:type m_batch_sizes: List[int]
:param m_durations: Total durations for each generate call in microseconds.
:type m_durations: List[MicroSeconds]
:param num_generated_tokens: Total number of tokens generated.
:type num_generated_tokens: int
:param m_durations: Total durations for each generate call in milliseconds.
:type m_durations: List[float]
:param num_input_tokens: Total number of tokens in the input prompt.
:type num_input_tokens: int
:param inference_durations: Total inference duration for each generate call in milliseconds.
:type inference_durations: List[float]
"""
def __init__(self) -> None:
...
Expand All @@ -1028,6 +1028,9 @@ class RawPerfMetrics:
def generate_durations(self) -> list[float]:
...
@property
def inference_durations(self) -> list[float]:
...
@property
def m_batch_sizes(self) -> list[int]:
...
@property
Expand All @@ -1040,6 +1043,9 @@ class RawPerfMetrics:
def m_times_to_first_token(self) -> list[float]:
...
@property
def token_infer_durations(self) -> list[float]:
...
@property
def tokenization_durations(self) -> list[float]:
...
class Scheduler:
Expand Down
40 changes: 23 additions & 17 deletions src/python/py_perf_metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,32 +20,32 @@ namespace {
auto raw_perf_metrics_docstring = R"(
Structure with raw performance metrics for each generation before any statistics are calculated.
:param generate_durations: Durations for each generate call in microseconds.
:type generate_durations: List[MicroSeconds]
:param generate_durations: Durations for each generate call in milliseconds.
:type generate_durations: List[float]
:param tokenization_durations: Durations for the tokenization process in microseconds.
:type tokenization_durations: List[MicroSeconds]
:param tokenization_durations: Durations for the tokenization process in milliseconds.
:type tokenization_durations: List[float]
:param detokenization_durations: Durations for the detokenization process in microseconds.
:type detokenization_durations: List[MicroSeconds]
:param detokenization_durations: Durations for the detokenization process in milliseconds.
:type detokenization_durations: List[float]
:param m_times_to_first_token: Times to the first token for each call in microseconds.
:type m_times_to_first_token: List[MicroSeconds]
:param m_times_to_first_token: Times to the first token for each call in milliseconds.
:type m_times_to_first_token: List[float]
:param m_new_token_times: Timestamps of generation every token or batch of tokens in milliseconds.
:type m_new_token_times: List[MilliSeconds]
:type m_new_token_times: List[float]
:param token_infer_durations: Inference time for each token in milliseconds.
:type token_infer_durations: List[float]
:param m_batch_sizes: Batch sizes for each generate call.
:type m_batch_sizes: List[int]
:param m_durations: Total durations for each generate call in microseconds.
:type m_durations: List[MicroSeconds]
:param num_generated_tokens: Total number of tokens generated.
:type num_generated_tokens: int
:param m_durations: Total durations for each generate call in milliseconds.
:type m_durations: List[float]
:param num_input_tokens: Total number of tokens in the input prompt.
:type num_input_tokens: int
:param inference_durations: Total inference duration for each generate call in milliseconds.
:type inference_durations: List[float]
)";

auto perf_metrics_docstring = R"(
Expand Down Expand Up @@ -145,10 +145,16 @@ void init_perf_metrics(py::module_& m) {
.def_property_readonly("m_new_token_times", [](const RawPerfMetrics &rw) {
return timestamp_to_ms(rw, &RawPerfMetrics::m_new_token_times);
})
.def_property_readonly("token_infer_durations", [](const RawPerfMetrics &rw) {
return get_ms(rw, &RawPerfMetrics::m_token_infer_durations);
})
.def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes)
.def_property_readonly("m_durations", [](const RawPerfMetrics &rw) {
return get_ms(rw, &RawPerfMetrics::m_durations);
})
.def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes);
.def_property_readonly("inference_durations", [](const RawPerfMetrics &rw) {
return get_ms(rw, &RawPerfMetrics::m_inference_durations);
});

py::class_<MeanStdPair>(m, "MeanStdPair")
.def(py::init<>())
Expand Down

0 comments on commit d681f1b

Please sign in to comment.