From 9e612b8f96d32c1dfaab9d877e477c224da7048e Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 20 Dec 2024 21:52:32 +0400 Subject: [PATCH] =?UTF-8?q?llm=5Fpipeline=5Fstatic:=20flush=20streamer=20a?= =?UTF-8?q?fter=20generation=20loop=20is=20complete=E2=80=A6=20(#1418)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … (#1350) Without these changes, chat_sample with NPU device produces responses that are clipped by 4 characters: ![image](https://github.com/user-attachments/assets/e841bf36-948b-4899-820f-6b52460076e9) Flushing the streamer (as [get_lm_encoded_results()](https://github.com/openvinotoolkit/openvino.genai/blob/71ea7aae7357fa0bb21a5161ef078bef8ce7af7c/src/cpp/src/lm_encoding.cpp#L224) does in non-static LLM cases) seems to resolve the issue. Signed-off-by: Ryan Metcalfe Co-authored-by: Ryan Metcalfe <107415876+RyanMetcalfeInt8@users.noreply.github.com> --- src/cpp/src/llm_pipeline_static.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 090aed9650..42430f70a6 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -1102,6 +1102,11 @@ EncodedResults StaticLLMPipeline::generate( m_kvcache_request.get_tensor(output_name).copy_to(kvcache_in_slice); } } + + if (streamer_ptr) { + streamer_ptr->end(); + } + auto stop_time = std::chrono::steady_clock::now(); // If is called without tokenization then that stat will not be reported. auto& metrics = results.perf_metrics;