Skip to content

Commit

Permalink
make token_inference_duration visible from py api (#1298)
Browse files Browse the repository at this point in the history
- @eaidova needs these metrics to be available from Python.
- Also fixed a couple of docstrings along the way.
  • Loading branch information
pavel-esir authored Dec 4, 2024
2 parents 48b2f6c + 728729c commit d681f1b
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 35 deletions.
2 changes: 0 additions & 2 deletions src/cpp/include/openvino/genai/perf_metrics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;
* @param m_batch_sizes Batch sizes for each generate call.
* @param m_durations Total durations for each generate call in microseconds.
* @param m_inference_durations Total inference duration for each generate call in microseconds.
* @param num_generated_tokens Total number of tokens generated.
* @param num_input_tokens Total number of tokens in the input prompt.
*/
struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {
std::vector<MicroSeconds> generate_durations;
Expand Down
38 changes: 22 additions & 16 deletions src/python/openvino_genai/py_openvino_genai.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -992,32 +992,32 @@ class RawPerfMetrics:
Structure with raw performance metrics for each generation before any statistics are calculated.
:param generate_durations: Durations for each generate call in microseconds.
:type generate_durations: List[MicroSeconds]
:param generate_durations: Durations for each generate call in milliseconds.
:type generate_durations: List[float]
:param tokenization_durations: Durations for the tokenization process in microseconds.
:type tokenization_durations: List[MicroSeconds]
:param tokenization_durations: Durations for the tokenization process in milliseconds.
:type tokenization_durations: List[float]
:param detokenization_durations: Durations for the detokenization process in microseconds.
:type detokenization_durations: List[MicroSeconds]
:param detokenization_durations: Durations for the detokenization process in milliseconds.
:type detokenization_durations: List[float]
:param m_times_to_first_token: Times to the first token for each call in microseconds.
:type m_times_to_first_token: List[MicroSeconds]
:param m_times_to_first_token: Times to the first token for each call in milliseconds.
:type m_times_to_first_token: List[float]
:param m_new_token_times: Timestamps of generation every token or batch of tokens in milliseconds.
:type m_new_token_times: List[MilliSeconds]
:type m_new_token_times: List[float]
:param token_infer_durations: Inference time for each token in milliseconds.
:type token_infer_durations: List[float]
:param m_batch_sizes: Batch sizes for each generate call.
:type m_batch_sizes: List[int]
:param m_durations: Total durations for each generate call in microseconds.
:type m_durations: List[MicroSeconds]
:param num_generated_tokens: Total number of tokens generated.
:type num_generated_tokens: int
:param m_durations: Total durations for each generate call in milliseconds.
:type m_durations: List[float]
:param num_input_tokens: Total number of tokens in the input prompt.
:type num_input_tokens: int
:param inference_durations: Total inference duration for each generate call in milliseconds.
:type inference_durations: List[float]
"""
def __init__(self) -> None:
...
Expand All @@ -1028,6 +1028,9 @@ class RawPerfMetrics:
def generate_durations(self) -> list[float]:
...
@property
def inference_durations(self) -> list[float]:
...
@property
def m_batch_sizes(self) -> list[int]:
...
@property
Expand All @@ -1040,6 +1043,9 @@ class RawPerfMetrics:
def m_times_to_first_token(self) -> list[float]:
...
@property
def token_infer_durations(self) -> list[float]:
...
@property
def tokenization_durations(self) -> list[float]:
...
class Scheduler:
Expand Down
40 changes: 23 additions & 17 deletions src/python/py_perf_metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,32 +20,32 @@ namespace {
auto raw_perf_metrics_docstring = R"(
Structure with raw performance metrics for each generation before any statistics are calculated.
:param generate_durations: Durations for each generate call in microseconds.
:type generate_durations: List[MicroSeconds]
:param generate_durations: Durations for each generate call in milliseconds.
:type generate_durations: List[float]
:param tokenization_durations: Durations for the tokenization process in microseconds.
:type tokenization_durations: List[MicroSeconds]
:param tokenization_durations: Durations for the tokenization process in milliseconds.
:type tokenization_durations: List[float]
:param detokenization_durations: Durations for the detokenization process in microseconds.
:type detokenization_durations: List[MicroSeconds]
:param detokenization_durations: Durations for the detokenization process in milliseconds.
:type detokenization_durations: List[float]
:param m_times_to_first_token: Times to the first token for each call in microseconds.
:type m_times_to_first_token: List[MicroSeconds]
:param m_times_to_first_token: Times to the first token for each call in milliseconds.
:type m_times_to_first_token: List[float]
:param m_new_token_times: Timestamps of generation every token or batch of tokens in milliseconds.
:type m_new_token_times: List[MilliSeconds]
:type m_new_token_times: List[float]
:param token_infer_durations: Inference time for each token in milliseconds.
:type token_infer_durations: List[float]
:param m_batch_sizes: Batch sizes for each generate call.
:type m_batch_sizes: List[int]
:param m_durations: Total durations for each generate call in microseconds.
:type m_durations: List[MicroSeconds]
:param num_generated_tokens: Total number of tokens generated.
:type num_generated_tokens: int
:param m_durations: Total durations for each generate call in milliseconds.
:type m_durations: List[float]
:param num_input_tokens: Total number of tokens in the input prompt.
:type num_input_tokens: int
:param inference_durations: Total inference duration for each generate call in milliseconds.
:type inference_durations: List[float]
)";

auto perf_metrics_docstring = R"(
Expand Down Expand Up @@ -145,10 +145,16 @@ void init_perf_metrics(py::module_& m) {
.def_property_readonly("m_new_token_times", [](const RawPerfMetrics &rw) {
return timestamp_to_ms(rw, &RawPerfMetrics::m_new_token_times);
})
.def_property_readonly("token_infer_durations", [](const RawPerfMetrics &rw) {
return get_ms(rw, &RawPerfMetrics::m_token_infer_durations);
})
.def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes)
.def_property_readonly("m_durations", [](const RawPerfMetrics &rw) {
return get_ms(rw, &RawPerfMetrics::m_durations);
})
.def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes);
.def_property_readonly("inference_durations", [](const RawPerfMetrics &rw) {
return get_ms(rw, &RawPerfMetrics::m_inference_durations);
});

py::class_<MeanStdPair>(m, "MeanStdPair")
.def(py::init<>())
Expand Down

0 comments on commit d681f1b

Please sign in to comment.