Skip to content

Commit

Permalink
[llm_bench] add infer latency metrics for genai (#1391)
Browse files Browse the repository at this point in the history
  • Loading branch information
eaidova authored Dec 16, 2024
1 parent c244054 commit 1f149a6
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 2 deletions.
2 changes: 1 addition & 1 deletion tools/llm_bench/task/speech_to_text_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list):
- np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
).tolist()
tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
- tm_infer_list = None
+ tm_infer_list = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
result_text = result_text.texts[0]
else:
start = time.perf_counter()
Expand Down
3 changes: 2 additions & 1 deletion tools/llm_bench/task/text_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ def token_printer():
np.mean(perf_metrics.raw_metrics.tokenization_durations) / 1000,
np.mean(perf_metrics.raw_metrics.detokenization_durations) / 1000
)
+ inference_durations = np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000
iter_data = gen_output_data.gen_iterate_data(
iter_idx=num,
in_size=num_input_tokens * args['batch_size'],
Expand All @@ -313,7 +314,7 @@ def token_printer():
num,
iter_data,
tm_list.tolist(),
- None,
+ inference_durations.tolist(),
warm_up=(num == 0),
max_rss_mem=max_rss_mem_consumption,
max_shared_mem=max_shared_mem_consumption,
Expand Down

0 comments on commit 1f149a6

Please sign in to comment.