From 499096a22d73e4365695f0d45077cd163aa45a1e Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 18 Dec 2024 20:27:04 +0400 Subject: [PATCH 1/9] [llm bench]: add infer latency for genai (#1397) CVS-158466 port from 2024.6 to master https://github.com/openvinotoolkit/openvino.genai/pull/1391 --- tools/llm_bench/task/speech_to_text_generation.py | 2 +- tools/llm_bench/task/text_generation.py | 3 ++- tools/llm_bench/task/visual_language_generation.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/llm_bench/task/speech_to_text_generation.py b/tools/llm_bench/task/speech_to_text_generation.py index f1e7ac54a0..15a47a8b6a 100644 --- a/tools/llm_bench/task/speech_to_text_generation.py +++ b/tools/llm_bench/task/speech_to_text_generation.py @@ -57,7 +57,7 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list): - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) ).tolist() tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist() - tm_infer_list = None + tm_infer_list = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist() result_text = result_text.texts[0] else: start = time.perf_counter() diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index 3f5b5ed301..485de94996 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -302,6 +302,7 @@ def token_printer(): ).tolist() tm_list = np.array([first_token_time] + second_tokens_durations) / 1000 + inference_durations = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist() log.debug('latency of all tokens:') [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] iter_data = gen_output_data.gen_iterate_data( @@ -323,7 +324,7 @@ def token_printer(): num, iter_data, tm_list.tolist(), - None, + inference_durations.tolist(), warm_up=(num == 0), 
max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py index c4144366b4..068ae0cf60 100644 --- a/tools/llm_bench/task/visual_language_generation.py +++ b/tools/llm_bench/task/visual_language_generation.py @@ -268,11 +268,12 @@ def run_visual_language_generation_genai( mm_embeddings_preparation_time=perf_metrics.get_prepare_embeddings_duration().mean ) iter_data_list.append(iter_data) + inference_durations = np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000 metrics_print.print_metrics( num, iter_data, tm_list.tolist(), - None, + inference_durations.tolist(), warm_up=(num == 0), max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, From 1542c60f9a07de77eb3485b3589b309d3c5b5347 Mon Sep 17 00:00:00 2001 From: Alexander Kozlov Date: Thu, 19 Dec 2024 10:39:36 +0300 Subject: [PATCH 2/9] Removed generator patching (#1408) --- tools/who_what_benchmark/whowhatbench/wwb.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py index 026a6cc69b..04813f5fd8 100644 --- a/tools/who_what_benchmark/whowhatbench/wwb.py +++ b/tools/who_what_benchmark/whowhatbench/wwb.py @@ -1,7 +1,3 @@ -from .utils import patch_diffusers - -patch_diffusers() - import argparse import difflib import numpy as np From 7a02d2bca6cf29dfe8fdcd796fca0d33ef275426 Mon Sep 17 00:00:00 2001 From: Anna Likholat Date: Thu, 19 Dec 2024 08:41:00 +0100 Subject: [PATCH 3/9] [ImageGeneration] EulerAncestralDiscreteScheduler (#1407) ![image](https://github.com/user-attachments/assets/6b688510-50d9-4f32-b80d-cb8cfa0b4b79) CVS-156803 CVS-158965 --------- Co-authored-by: Ilya Lavrenov --- .../genai/image_generation/scheduler.hpp | 3 +- .../schedulers/euler_ancestral_discrete.cpp | 261 ++++++++++++++++++ .../schedulers/euler_ancestral_discrete.hpp | 
61 ++++ .../image_generation/schedulers/scheduler.cpp | 3 + .../src/image_generation/schedulers/types.cpp | 2 + src/docs/SUPPORTED_MODELS.md | 1 + .../openvino_genai/py_openvino_genai.pyi | 5 +- src/python/py_image_generation_pipelines.cpp | 3 +- tools/llm_bench/llm_bench_utils/ov_utils.py | 2 +- 9 files changed, 337 insertions(+), 4 deletions(-) create mode 100644 src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp create mode 100644 src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp diff --git a/src/cpp/include/openvino/genai/image_generation/scheduler.hpp b/src/cpp/include/openvino/genai/image_generation/scheduler.hpp index 21c266aa50..25c5e07a2f 100644 --- a/src/cpp/include/openvino/genai/image_generation/scheduler.hpp +++ b/src/cpp/include/openvino/genai/image_generation/scheduler.hpp @@ -19,7 +19,8 @@ class OPENVINO_GENAI_EXPORTS Scheduler { DDIM, EULER_DISCRETE, FLOW_MATCH_EULER_DISCRETE, - PNDM + PNDM, + EULER_ANCESTRAL_DISCRETE }; static std::shared_ptr from_config(const std::filesystem::path& scheduler_config_path, diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp new file mode 100644 index 0000000000..a63a073cfc --- /dev/null +++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp @@ -0,0 +1,261 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include + +#include "image_generation/schedulers/euler_ancestral_discrete.hpp" +#include "image_generation/numpy_utils.hpp" + +namespace ov { +namespace genai { + +EulerAncestralDiscreteScheduler::Config::Config(const std::filesystem::path& scheduler_config_path) { + std::ifstream file(scheduler_config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + 
read_json_param(data, "num_train_timesteps", num_train_timesteps); + read_json_param(data, "beta_start", beta_start); + read_json_param(data, "beta_end", beta_end); + read_json_param(data, "beta_schedule", beta_schedule); + read_json_param(data, "trained_betas", trained_betas); + read_json_param(data, "steps_offset", steps_offset); + read_json_param(data, "prediction_type", prediction_type); + read_json_param(data, "timestep_spacing", timestep_spacing); + read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); +} + +EulerAncestralDiscreteScheduler::EulerAncestralDiscreteScheduler(const std::filesystem::path& scheduler_config_path) + : EulerAncestralDiscreteScheduler(Config(scheduler_config_path)) { +} + +EulerAncestralDiscreteScheduler::EulerAncestralDiscreteScheduler(const Config& scheduler_config): m_config(scheduler_config) { + std::vector alphas, betas; + + using numpy_utils::linspace; + + if (!m_config.trained_betas.empty()) { + betas = m_config.trained_betas; + } else if (m_config.beta_schedule == BetaSchedule::LINEAR) { + betas = linspace(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps); + } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) { + float start = std::sqrt(m_config.beta_start); + float end = std::sqrt(m_config.beta_end); + betas = linspace(start, end, m_config.num_train_timesteps); + std::for_each(betas.begin(), betas.end(), [](float& x) { + x *= x; + }); + // TODO: else if beta_schedule == "squaredcos_cap_v2" + } else { + OPENVINO_THROW( + "'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. 
Please, add support of other types"); + } + + if (m_config.rescale_betas_zero_snr) { + using numpy_utils::rescale_zero_terminal_snr; + rescale_zero_terminal_snr(betas); + } + + std::transform(betas.begin(), betas.end(), std::back_inserter(alphas), [](float b) { + return 1.0f - b; + }); + + for (size_t i = 1; i <= alphas.size(); ++i) { + float alpha_cumprod = + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies{}); + m_alphas_cumprod.push_back(alpha_cumprod); + } + + if (m_config.rescale_betas_zero_snr) { + m_alphas_cumprod.back() = std::pow(2, -24); + } + + for (auto it = m_alphas_cumprod.rbegin(); it != m_alphas_cumprod.rend(); ++it) { + float sigma = std::pow(((1 - (*it)) / (*it)), 0.5); + m_sigmas.push_back(sigma); + } + m_sigmas.push_back(0); + + // setable values + auto linspaced = + linspace(0.0f, static_cast(m_config.num_train_timesteps - 1), m_config.num_train_timesteps, true); + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { + m_timesteps.push_back(static_cast(std::round(*it))); + } + m_num_inference_steps = -1; + m_step_index = -1; + m_begin_index = -1; + m_is_scale_input_called = false; +} + +void EulerAncestralDiscreteScheduler::set_timesteps(size_t num_inference_steps, float strength) { + m_timesteps.clear(); + m_sigmas.clear(); + m_step_index = m_begin_index = -1; + m_num_inference_steps = num_inference_steps; + std::vector sigmas; + + switch (m_config.timestep_spacing) { + case TimestepSpacing::LINSPACE: { + using numpy_utils::linspace; + float end = static_cast(m_config.num_train_timesteps - 1); + auto linspaced = linspace(0.0f, end, num_inference_steps, true); + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { + m_timesteps.push_back(static_cast(std::round(*it))); + } + break; + } + case TimestepSpacing::LEADING: { + size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps; + for (size_t i = num_inference_steps - 1; i != -1; --i) { + m_timesteps.push_back(i * 
step_ratio + m_config.steps_offset); + } + break; + } + case TimestepSpacing::TRAILING: { + float step_ratio = static_cast(m_config.num_train_timesteps) / static_cast(m_num_inference_steps); + for (float i = m_config.num_train_timesteps; i > 0; i -= step_ratio) { + m_timesteps.push_back(static_cast(std::round(i)) - 1); + } + break; + } + default: + OPENVINO_THROW("Unsupported value for 'timestep_spacing'"); + } + + for (const float& i : m_alphas_cumprod) { + float sigma = std::pow(((1 - i) / i), 0.5); + sigmas.push_back(sigma); + } + + using numpy_utils::interp; + std::vector x_data_points(sigmas.size()); + std::iota(x_data_points.begin(), x_data_points.end(), 0); + m_sigmas = interp(m_timesteps, x_data_points, sigmas); + m_sigmas.push_back(0.0f); + + // apply 'strength' used in image generation + // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L650 + { + size_t init_timestep = std::min(num_inference_steps * strength, num_inference_steps); + size_t t_start = std::max(num_inference_steps - init_timestep, 0); + // keep original timesteps + m_schedule_timesteps = m_timesteps; + // while return patched ones by 'strength' parameter + m_timesteps = std::vector(m_timesteps.begin() + t_start, m_timesteps.end()); + m_begin_index = t_start; + } +} + +std::map EulerAncestralDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) { + // noise_pred - model_output + // latents - sample + // inference_step + + size_t timestep = m_timesteps[inference_step]; + + if (m_step_index == -1) + m_step_index = m_begin_index; + + float sigma = m_sigmas[m_step_index]; + + float* model_output_data = noise_pred.data(); + float* sample_data = latents.data(); + + ov::Tensor pred_original_sample(noise_pred.get_element_type(), noise_pred.get_shape()); + float* pred_original_sample_data = pred_original_sample.data(); + + switch 
(m_config.prediction_type) { + case PredictionType::EPSILON: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = sample_data[i] - sigma * model_output_data[i]; + } + break; + case PredictionType::V_PREDICTION: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = model_output_data[i] * (-sigma / std::pow((std::pow(sigma, 2) + 1), 0.5)) + + (sample_data[i] / (std::pow(sigma, 2) + 1)); + } + break; + default: + OPENVINO_THROW("Unsupported value for 'PredictionType': must be one of `epsilon`, or `v_prediction`"); + } + + float sigma_from = m_sigmas[m_step_index]; + float sigma_to = m_sigmas[m_step_index + 1]; + float sigma_up = std::sqrt(std::pow(sigma_to, 2) * (std::pow(sigma_from, 2) - std::pow(sigma_to, 2)) / std::pow(sigma_from, 2)); + float sigma_down = std::sqrt(std::pow(sigma_to, 2) - std::pow(sigma_up, 2)); + float dt = sigma_down - sigma; + + ov::Tensor prev_sample = ov::Tensor(latents.get_element_type(), latents.get_shape()); + float* prev_sample_data = prev_sample.data(); + + ov::Tensor noise = generator->randn_tensor(noise_pred.get_shape()); + const float* noise_data = noise.data(); + + for (size_t i = 0; i < prev_sample.get_size(); ++i) { + float derivative = (sample_data[i] - pred_original_sample_data[i]) / sigma; + prev_sample_data[i] = (sample_data[i] + derivative * dt) + noise_data[i] * sigma_up; + } + + m_step_index++; + + return {{"latent", prev_sample}, {"denoised", pred_original_sample}}; +} + +size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const{ + for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) { + if (timestep == m_schedule_timesteps[i]) { + return i; + } + } + + OPENVINO_THROW("Failed to find index for timestep ", timestep); +} + +void EulerAncestralDiscreteScheduler::add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const { + size_t index_for_timestep = _index_for_timestep(latent_timestep); + const float 
sigma = m_sigmas[index_for_timestep]; + + float * init_latent_data = init_latent.data(); + const float * noise_data = noise.data(); + + for (size_t i = 0; i < init_latent.get_size(); ++i) { + init_latent_data[i] = init_latent_data[i] + sigma * noise_data[i]; + } +} + +std::vector EulerAncestralDiscreteScheduler::get_timesteps() const { + return m_timesteps; +} + +void EulerAncestralDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { + if (m_step_index == -1) + m_step_index = m_begin_index; + + float sigma = m_sigmas[m_step_index]; + float* sample_data = sample.data(); + for (size_t i = 0; i < sample.get_size(); i++) { + sample_data[i] /= std::pow((std::pow(sigma, 2) + 1), 0.5); + } + m_is_scale_input_called = true; +} + +float EulerAncestralDiscreteScheduler::get_init_noise_sigma() const { + float max_sigma = *std::max_element(m_sigmas.begin(), m_sigmas.end()); + + if (m_config.timestep_spacing == TimestepSpacing::LINSPACE || + m_config.timestep_spacing == TimestepSpacing::TRAILING) { + return max_sigma; + } + + return std::sqrt(std::pow(max_sigma, 2) + 1); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp new file mode 100644 index 0000000000..9d82c9a0a9 --- /dev/null +++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp @@ -0,0 +1,61 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +#include "image_generation/schedulers/types.hpp" +#include "image_generation/schedulers/ischeduler.hpp" + +namespace ov { +namespace genai { + +class EulerAncestralDiscreteScheduler : public IScheduler { +public: + struct Config { + int32_t num_train_timesteps = 1000; + float beta_start = 0.0001f, beta_end = 0.02f; + BetaSchedule beta_schedule = BetaSchedule::LINEAR; + std::vector trained_betas = {}; 
+ size_t steps_offset = 0; + PredictionType prediction_type = PredictionType::EPSILON; + TimestepSpacing timestep_spacing = TimestepSpacing::LEADING; + bool rescale_betas_zero_snr = false; + + Config() = default; + explicit Config(const std::filesystem::path& scheduler_config_path); + }; + + explicit EulerAncestralDiscreteScheduler(const std::filesystem::path& scheduler_config_path); + explicit EulerAncestralDiscreteScheduler(const Config& scheduler_config); + + void set_timesteps(size_t num_inference_steps, float strength) override; + + std::vector get_timesteps() const override; + + float get_init_noise_sigma() const override; + + void scale_model_input(ov::Tensor sample, size_t inference_step) override; + + std::map step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) override; + + void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override; + +private: + Config m_config; + + std::vector m_alphas_cumprod, m_sigmas; + std::vector m_timesteps, m_schedule_timesteps; + size_t m_num_inference_steps; + + int m_step_index, m_begin_index; + bool m_is_scale_input_called; + + size_t _index_for_timestep(int64_t timestep) const; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/image_generation/schedulers/scheduler.cpp b/src/cpp/src/image_generation/schedulers/scheduler.cpp index f9cd098346..868f6f05cf 100644 --- a/src/cpp/src/image_generation/schedulers/scheduler.cpp +++ b/src/cpp/src/image_generation/schedulers/scheduler.cpp @@ -11,6 +11,7 @@ #include "image_generation/schedulers/euler_discrete.hpp" #include "image_generation/schedulers/flow_match_euler_discrete.hpp" #include "image_generation/schedulers/pndm.hpp" +#include "image_generation/schedulers/euler_ancestral_discrete.hpp" namespace ov { namespace genai { @@ -41,6 +42,8 @@ std::shared_ptr Scheduler::from_config(const std::filesystem::path& s scheduler = std::make_shared(scheduler_config_path); } else if 
(scheduler_type == Scheduler::Type::PNDM) { scheduler = std::make_shared(scheduler_config_path); + } else if (scheduler_type == Scheduler::Type::EULER_ANCESTRAL_DISCRETE) { + scheduler = std::make_shared(scheduler_config_path); } else { OPENVINO_THROW("Unsupported scheduler type '", scheduler_type, ". Please, manually create scheduler via supported one"); } diff --git a/src/cpp/src/image_generation/schedulers/types.cpp b/src/cpp/src/image_generation/schedulers/types.cpp index 2f7c6d3f25..5a9e5b6865 100644 --- a/src/cpp/src/image_generation/schedulers/types.cpp +++ b/src/cpp/src/image_generation/schedulers/types.cpp @@ -57,6 +57,8 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Schedu param = Scheduler::FLOW_MATCH_EULER_DISCRETE; else if (scheduler_type_str == "PNDMScheduler") param = Scheduler::PNDM; + else if (scheduler_type_str == "EulerAncestralDiscreteScheduler") + param = Scheduler::EULER_ANCESTRAL_DISCRETE; else if (!scheduler_type_str.empty()) { OPENVINO_THROW("Unsupported value for 'scheduler' ", scheduler_type_str); } diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index 8c922ee644..9762874596 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -217,6 +217,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel` diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 524ff0f921..bfcb869157 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -1343,15 +1343,18 @@ class Scheduler: FLOW_MATCH_EULER_DISCRETE PNDM + + EULER_ANCESTRAL_DISCRETE """ AUTO: typing.ClassVar[Scheduler.Type] # value = DDIM: typing.ClassVar[Scheduler.Type] # value = + EULER_ANCESTRAL_DISCRETE: typing.ClassVar[Scheduler.Type] # value = EULER_DISCRETE: typing.ClassVar[Scheduler.Type] # value = FLOW_MATCH_EULER_DISCRETE: typing.ClassVar[Scheduler.Type] # 
value = LCM: typing.ClassVar[Scheduler.Type] # value = LMS_DISCRETE: typing.ClassVar[Scheduler.Type] # value = PNDM: typing.ClassVar[Scheduler.Type] # value = - __members__: typing.ClassVar[dict[str, Scheduler.Type]] # value = {'AUTO': , 'LCM': , 'LMS_DISCRETE': , 'DDIM': , 'EULER_DISCRETE': , 'FLOW_MATCH_EULER_DISCRETE': , 'PNDM': } + __members__: typing.ClassVar[dict[str, Scheduler.Type]] # value = {'AUTO': , 'LCM': , 'LMS_DISCRETE': , 'DDIM': , 'EULER_DISCRETE': , 'FLOW_MATCH_EULER_DISCRETE': , 'PNDM': , 'EULER_ANCESTRAL_DISCRETE': } def __eq__(self, other: typing.Any) -> bool: ... def __getstate__(self) -> int: diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index f5347c279d..311f3f3760 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -198,7 +198,8 @@ void init_image_generation_pipelines(py::module_& m) { .value("DDIM", ov::genai::Scheduler::Type::DDIM) .value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE) .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE) - .value("PNDM", ov::genai::Scheduler::Type::PNDM); + .value("PNDM", ov::genai::Scheduler::Type::PNDM) + .value("EULER_ANCESTRAL_DISCRETE", ov::genai::Scheduler::Type::EULER_ANCESTRAL_DISCRETE); image_generation_scheduler.def_static("from_config", &ov::genai::Scheduler::from_config, py::arg("scheduler_config_path"), diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py index c3df84925b..316c9d0b89 100644 --- a/tools/llm_bench/llm_bench_utils/ov_utils.py +++ b/tools/llm_bench/llm_bench_utils/ov_utils.py @@ -421,7 +421,7 @@ def get_vae_decoder_step_count(self): scheduler_type = data.get("scheduler", ["", ""])[1] if (scheduler_type not in ["LCMScheduler", "DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler", "EulerDiscreteScheduler", - "FlowMatchEulerDiscreteScheduler"]): + 
"FlowMatchEulerDiscreteScheduler", "EulerAncestralDiscreteScheduler"]): scheduler = openvino_genai.Scheduler.from_config(model_path / "scheduler/scheduler_config.json", openvino_genai.Scheduler.Type.DDIM) log.warning(f'Type of scheduler {scheduler_type} is unsupported. Please, be aware that it will be replaced to DDIMScheduler') From 17f4eb32a1586aec8f42183e0667348b2cbd2fef Mon Sep 17 00:00:00 2001 From: Sofya Balandina Date: Thu, 19 Dec 2024 07:45:17 +0000 Subject: [PATCH 4/9] fill prompt for sampler analysis with real tokens in VLM pipeline (#1247) + add missed token, if prev generation was finished because max length was reached --- src/cpp/src/utils.cpp | 8 +++ src/cpp/src/utils.hpp | 2 + .../src/visual_language/inputs_embedder.cpp | 56 ++++++++++++------- .../src/visual_language/inputs_embedder.hpp | 8 ++- src/cpp/src/visual_language/pipeline.cpp | 12 ++-- 5 files changed, 59 insertions(+), 27 deletions(-) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 9fa14b7f9f..be9fc972dc 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -381,6 +381,14 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t se } } +ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, int64_t add_to_front) { + ov::Tensor new_tensor = ov::Tensor{ov::element::i64, {base_tensor.get_shape().at(0), base_tensor.get_shape().at(1) + 1}}; + auto new_tensor_data = new_tensor.data(); + new_tensor_data[0] = add_to_front; + std::copy_n(base_tensor.data(), base_tensor.get_size(), new_tensor_data + 1); + return new_tensor; +} + void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const char* model_title) { // Specify the name of the environment variable const char* env_var_name = "OPENVINO_LOG_LEVEL"; diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 5342ac427c..96191387cd 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -104,6 +104,8 @@ size_t get_seq_len_axis(std::shared_ptr model); void 
trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional adapter_controller); +ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, int64_t add_to_front); + void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const char* model_title); } // namespace utils diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index cf77dfce3c..8175d44b16 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -10,6 +10,7 @@ #include "utils.hpp" + namespace { constexpr size_t BATCH_SIZE = 1; @@ -40,10 +41,12 @@ class InputsEmbedder::IInputsEmbedder { // Templated chat history std::string m_templated_chat_history; // Tokenized chat history - std::vector m_tokenized_chat_history; + std::vector m_tokenized_history; // The number of elements, which need to remove from the end of KV cache // removed elements will be added to inputs_ids size_t m_to_remove_from_hist = 0; + // Tail of previous output for LM in chat mode is missing in KV cache. 
+ std::optional m_last_disappeared_token = std::nullopt; public: virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) = 0; @@ -56,26 +59,30 @@ class InputsEmbedder::IInputsEmbedder { return m_tokenizer; } - std::vector get_tokenized_chat_history() const { - return m_tokenized_chat_history; + std::vector get_tokenized_history() const { + return m_tokenized_history; } size_t get_amount_to_remove_from_hist() const { return m_to_remove_from_hist; } - void update_tokenized_chat_history(std::vector encoded_result) { - std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_chat_history)); + void update_tokenized_history(std::vector encoded_result, bool token_will_disappear) { + std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history)); m_to_remove_from_hist = 0; + if (token_will_disappear) + m_last_disappeared_token = encoded_result.back(); + else + m_last_disappeared_token = std::nullopt; } virtual void start_chat(const std::string& system_message) { m_is_chat_conversation = true; m_to_remove_from_hist = 0; - if (!m_tokenized_chat_history.empty()) { + if (!m_tokenized_history.empty()) { m_history.clear(); m_templated_chat_history.clear(); - m_tokenized_chat_history.clear(); + m_tokenized_history.clear(); } if (system_message.empty()) { return; @@ -98,7 +105,7 @@ class InputsEmbedder::IInputsEmbedder { m_history.clear(); m_templated_chat_history.clear(); - m_tokenized_chat_history.clear(); + m_tokenized_history.clear(); } protected: @@ -165,37 +172,46 @@ class InputsEmbedder::IInputsEmbedder { // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history // so let's check it out, find the trusted part and use it in on the next step size_t last_same_hist_token = 0; - if (!m_tokenized_chat_history.empty()) { + if (!m_tokenized_history.empty()) { std::set stop_tokens = 
{m_tokenizer.get_eos_token_id()}; - last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens); + last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens); } - if (m_tokenized_chat_history.empty()) { + if (m_tokenized_history.empty()) { encoded_input_ids = new_chat_tokens; } else if (last_same_hist_token != SIZE_MAX) { - m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token; + m_to_remove_from_hist = m_tokenized_history.size() - last_same_hist_token; + // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it + m_to_remove_from_hist -= m_last_disappeared_token.has_value() ? 1 : 0; ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(), {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token}, new_chat_tokens.data() + last_same_hist_token); - encoded_input_ids = new_tensor; + encoded_input_ids = ov::Tensor(new_chat_tokens.get_element_type(), + {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token}); + new_tensor.copy_to(encoded_input_ids); } else { encoded_input_ids = utils::subtract_chat_tokenized_inputs( {new_chat_tokens}, prev_chat_tokens ).input_ids; + + if (m_last_disappeared_token.has_value()) + encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); } auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); - m_tokenized_chat_history.clear(); - std::copy(new_chat_tokens.data(), new_chat_tokens.data() + new_chat_tokens.get_size(), - std::back_inserter(m_tokenized_chat_history)); + m_tokenized_history.clear(); + std::copy_n(new_chat_tokens.data(), 
new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); } else { auto start_tokenizer_time = std::chrono::steady_clock::now(); encoded_input_ids = m_tokenizer.encode(prompt).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + m_tokenized_history.clear(); + std::copy_n(encoded_input_ids.data(), encoded_input_ids.get_size(), std::back_inserter(m_tokenized_history)); } + return encoded_input_ids; } @@ -1172,12 +1188,12 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const { return m_impl->get_embedding_model(); } -std::vector InputsEmbedder::get_tokenized_chat_history() const { - return m_impl->get_tokenized_chat_history(); +std::vector InputsEmbedder::get_tokenized_history() const { + return m_impl->get_tokenized_history(); } -void InputsEmbedder::update_tokenized_chat_history(std::vector encoded_result) { - return m_impl->update_tokenized_chat_history(encoded_result); +void InputsEmbedder::update_tokenized_history(std::vector encoded_result, bool token_will_disappear) { + return m_impl->update_tokenized_history(encoded_result, token_will_disappear); } size_t InputsEmbedder::get_amount_to_remove_from_hist() const { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 5c5b9d2b81..8c84c6ad43 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -41,16 +41,20 @@ class InputsEmbedder { Tokenizer get_tokenizer() const; // returns tokenized chat history - std::vector get_tokenized_chat_history() const; + std::vector get_tokenized_history() const; + // add new results to tokenized chat history - void update_tokenized_chat_history(std::vector encoded_result); + void update_tokenized_history(std::vector encoded_result, bool token_will_disappear); + // returns amount of elements, 
which need to remove from the end of the KV cache size_t get_amount_to_remove_from_hist() const; // starts chat and adds optional system_message to chat history void start_chat(const std::string& system_message); + // adds currently generated text to chat history void update_chat_history(const std::string& decoded_results); + // finishes chat and clears a chat history void finish_chat(); private: diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 1ce0cbf210..0d7aebc506 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -17,6 +17,7 @@ #include "utils.hpp" #include "lm_encoding.hpp" + using namespace ov::genai; namespace { @@ -163,19 +164,18 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { auto to_remove_from_hist = m_inputs_embedder->get_amount_to_remove_from_hist(); ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, m_kv_cache_seq_length_axis, std::nullopt); - Sampler sampler = Sampler(m_tokenizer); - std::vector requests; size_t request_id = 0; size_t block_size = 1; // not used bool enable_prefix_caching = false; - auto tokenized_chat_history = m_inputs_embedder->get_tokenized_chat_history(); size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - to_remove_from_hist; size_t inputs_embeds_size = inputs_embeds.get_shape().at(1); + auto tokenized_history = m_inputs_embedder->get_tokenized_history(); ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size }); - std::fill_n(prompt_ids.data(), prompt_ids.get_size(), 0); + std::fill_n(prompt_ids.data(), prompt_ids.get_size(), m_tokenizer.get_pad_token_id()); + std::copy(tokenized_history.begin(), tokenized_history.end(), prompt_ids.data()); SequenceGroup::Ptr sequence_group = std::make_shared(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching); sequence_group->set_sequence_group_ptr(sequence_group); @@ -204,6 +204,8 @@ class 
ov::genai::VLMPipeline::VLMPipelineImpl { ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }}; std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), history_size); + Sampler sampler = Sampler(m_tokenizer); + ov::genai::EncodedResults encoded_result; int32_t m_selected_beam = 0; std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler, requests, @@ -243,7 +245,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { decoded.perf_metrics.m_evaluated = false; decoded.perf_metrics.evaluate_statistics(generate_start_time); - m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0]); + m_inputs_embedder->update_tokenized_history(encoded_result.tokens[0], requests[0]->get_finished_sequences()[0]->get_finish_reason() == GenerationFinishReason::LENGTH); return decoded; } From e1f910ddef54728cc1147c9f839a09cdc176c2dd Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 19 Dec 2024 11:48:09 +0100 Subject: [PATCH 5/9] Whisper pipeline: cache models in python tests (#1389) Ticket: 159277 --- tests/python_tests/test_whisper_generate_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py index 5a68dd98b6..9a117bc939 100644 --- a/tests/python_tests/test_whisper_generate_api.py +++ b/tests/python_tests/test_whisper_generate_api.py @@ -25,7 +25,9 @@ def run_gc_after_test(): yield gc.collect() -@functools.lru_cache(1) +# used whisper models are relatively small +# cache them in memory to speedup tests +@functools.lru_cache(3) def read_whisper_model(params, **tokenizer_kwargs): model_id, path = params From 0be7b3df3d28fa6c9009f1f070851b21bac4a4bf Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 19 Dec 2024 12:06:50 +0100 Subject: [PATCH 6/9] Whisper pipeline: implement 'initial_prompt' and 'hotwords' 
parameters (#1378) Adds: * `initial_prompt` parameter ([faster_whisper reference](https://github.com/SYSTRAN/faster-whisper/blob/203dddb047fd2c3ed2a520fe1416467a527e0f37/faster_whisper/transcribe.py#L732)) - injects initial prompt tokens as a previous transcription into the first processing window * `hotwords` parameter ([faster_whisper reference](https://github.com/SYSTRAN/faster-whisper/blob/203dddb047fd2c3ed2a520fe1416467a527e0f37/faster_whisper/transcribe.py#L768)) - injects hotwords tokens as a previous transcription into the all processing windows * Whisper pipeline usage notes in samples Closes https://github.com/openvinotoolkit/openvino.genai/issues/1150 Ticket: 156888 --- .../cpp/whisper_speech_recognition/README.md | 85 ++++++++++++++++++ .../whisper_speech_recognition.cpp | 1 + .../whisper_speech_recognition/README.md | 87 ++++++++++++++++++ .../whisper_speech_recognition.py | 7 +- .../genai/whisper_generation_config.hpp | 34 ++++++- src/cpp/src/whisper/context_tokens.cpp | 89 +++++++++++++++++++ src/cpp/src/whisper/context_tokens.hpp | 25 ++++++ src/cpp/src/whisper/whisper.cpp | 24 +++-- src/cpp/src/whisper/whisper.hpp | 2 + src/cpp/src/whisper_generation_config.cpp | 5 +- src/cpp/src/whisper_pipeline.cpp | 6 ++ src/cpp/src/whisper_pipeline_static.cpp | 3 + .../openvino_genai/py_openvino_genai.pyi | 53 +++++++++++ src/python/py_whisper_pipeline.cpp | 28 ++++++ .../python_tests/test_whisper_generate_api.py | 25 ++++++ 15 files changed, 460 insertions(+), 14 deletions(-) create mode 100644 src/cpp/src/whisper/context_tokens.cpp create mode 100644 src/cpp/src/whisper/context_tokens.hpp diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md index 773135b648..d649266613 100644 --- a/samples/cpp/whisper_speech_recognition/README.md +++ b/samples/cpp/whisper_speech_recognition/README.md @@ -33,6 +33,91 @@ timestamps: [0, 2] text: How are you doing today? 
See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models. +# Whisper pipeline usage + +```c++ +#include "openvino/genai/whisper_pipeline.hpp" + +ov::genai::WhisperPipeline pipeline(model_dir, "CPU"); +// Pipeline expects normalized audio with Sample Rate of 16kHz +ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav"); +auto result = pipeline.generate(raw_speech); +// How are you doing today? +``` + +### Transcription + +Whisper pipeline predicts the language of the source audio automatically. + +```c++ +ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav"); +auto result = pipeline.generate(raw_speech); +// How are you doing today? + +raw_speech = read_wav("fr_sample.wav"); +result = pipeline.generate(raw_speech); +// Il s'agit d'une entité très complexe qui consiste... +``` + +If the source audio language is known in advance, it can be specified as an argument to the `generate` method: + +```c++ +ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav"); +auto result = pipeline.generate(raw_speech, ov::genai::language("<|en|>")); +// How are you doing today? + +raw_speech = read_wav("fr_sample.wav"); +result = pipeline.generate(raw_speech, ov::genai::language("<|fr|>")); +// Il s'agit d'une entité très complexe qui consiste... +``` + +### Translation + +By default, Whisper performs the task of speech transcription, where the source audio language is the same as the target text language. To perform speech translation, where the target text is in English, set the task to "translate": + +```c++ +ov::genai::RawSpeechInput raw_speech = read_wav("fr_sample.wav"); +auto result = pipeline.generate(raw_speech, ov::genai::task("translate")); +// It is a very complex entity that consists... +``` + +### Timestamps prediction + +The model can predict timestamps.
For sentence-level timestamps, pass the `return_timestamps` argument: + +```C++ +ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav"); +auto result = pipeline.generate(raw_speech, ov::genai::return_timestamps(true)); + +std::cout << std::setprecision(2); +for (auto& chunk : *result.chunks) { + std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n"; +} +// timestamps: [0, 2] text: How are you doing today? +``` + +### Long-Form audio Transcription + +The Whisper model is designed to work on audio samples of up to 30s in duration. Whisper pipeline uses a sequential chunking algorithm to transcribe audio samples of arbitrary length. +The sequential chunking algorithm uses a "sliding window", transcribing 30-second slices one after the other. + +### Initial prompt and hotwords + +Whisper pipeline has `initial_prompt` and `hotwords` generate arguments: +* `initial_prompt`: initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window +* `hotwords`: hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to all processing windows + +The Whisper model can use that context to better understand the speech and maintain a consistent writing style. However, prompts do not need to be genuine transcripts from prior audio segments. Such prompts can be used to steer the model to use particular spellings or styles: + +```c++ +auto result = pipeline.generate(raw_speech); +// He has gone and gone for good answered Paul Icrom who... + +result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome")); +// He has gone and gone for good answered Polychrome who...
+``` + + ### Troubleshooting #### Empty or rubbish output diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp index 31d3f8c551..3df17a77f5 100644 --- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp +++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp @@ -28,6 +28,7 @@ int main(int argc, char* argv[]) try { std::cout << result << "\n"; + std::cout << std::setprecision(2); for (auto& chunk : *result.chunks) { std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n"; } diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md index 158bd18311..aeb46444bf 100644 --- a/samples/python/whisper_speech_recognition/README.md +++ b/samples/python/whisper_speech_recognition/README.md @@ -40,6 +40,93 @@ timestamps: [0, 2] text: How are you doing today? See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models. +# Whisper pipeline usage + +```python +import openvino_genai +import librosa + +def read_wav(filepath): + raw_speech, samplerate = librosa.load(filepath, sr=16000) + return raw_speech.tolist() + +pipe = openvino_genai.WhisperPipeline(model_dir, "CPU") +# Pipeline expects normalized audio with Sample Rate of 16kHz +raw_speech = read_wav('how_are_you_doing_today.wav') +result = pipe.generate(raw_speech) +# How are you doing today? +``` + +### Transcription + +Whisper pipeline predicts the language of the source audio automatically. + +```python +raw_speech = read_wav('how_are_you_doing_today.wav') +result = pipe.generate(raw_speech) +# How are you doing today? + +raw_speech = read_wav('fr_sample.wav') +result = pipe.generate(raw_speech) +# Il s'agit d'une entité très complexe qui consiste... 
+``` + +If the source audio language is known in advance, it can be specified as an argument to the `generate` method: + +```python +raw_speech = read_wav("how_are_you_doing_today.wav") +result = pipe.generate(raw_speech, language="<|en|>") +# How are you doing today? + +raw_speech = read_wav("fr_sample.wav") +result = pipe.generate(raw_speech, language="<|fr|>") +# Il s'agit d'une entité très complexe qui consiste... +``` + +### Translation + +By default, Whisper performs the task of speech transcription, where the source audio language is the same as the target text language. To perform speech translation, where the target text is in English, set the task to "translate": + +```python +raw_speech = read_wav("fr_sample.wav") +result = pipe.generate(raw_speech, task="translate") +# It is a very complex entity that consists... +``` + +### Timestamps prediction + +The model can predict timestamps. For sentence-level timestamps, pass the `return_timestamps` argument: + +```python +raw_speech = read_wav("how_are_you_doing_today.wav") +result = pipe.generate(raw_speech, return_timestamps=True) + +for chunk in result.chunks: + print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}") +# timestamps: [0.00, 2.00] text: How are you doing today? +``` + +### Long-Form audio Transcription + +The Whisper model is designed to work on audio samples of up to 30s in duration. Whisper pipeline uses a sequential chunking algorithm to transcribe audio samples of arbitrary length. +The sequential chunking algorithm uses a "sliding window", transcribing 30-second slices one after the other.
+ +### Initial prompt and hotwords + +Whisper pipeline has `initial_prompt` and `hotwords` generate arguments: +* `initial_prompt`: initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window +* `hotwords`: hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to all processing windows + +The Whisper model can use that context to better understand the speech and maintain a consistent writing style. However, prompts do not need to be genuine transcripts from prior audio segments. Such prompts can be used to steer the model to use particular spellings or styles: + +```python +result = pipe.generate(raw_speech) +# He has gone and gone for good answered Paul Icrom who... + +result = pipe.generate(raw_speech, initial_prompt="Polychrome") +# He has gone and gone for good answered Polychrome who... +``` + ### Troubleshooting #### Empty or rubbish output diff --git a/samples/python/whisper_speech_recognition/whisper_speech_recognition.py b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py index 3fddfc8ffa..9cf3be5fa1 100755 --- a/samples/python/whisper_speech_recognition/whisper_speech_recognition.py +++ b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py @@ -18,7 +18,7 @@ def main(): parser.add_argument("wav_file_path") args = parser.parse_args() - device = "CPU" # GPU can be used as well + device = "CPU" # GPU, NPU can be used as well pipe = openvino_genai.WhisperPipeline(args.model_dir, device) config = pipe.get_generation_config() @@ -34,8 +34,9 @@ def main(): print(result) - for chunk in result.chunks: - print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}") + if result.chunks: + for chunk in result.chunks: + print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}") if "__main__" == __name__: diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp
b/src/cpp/include/openvino/genai/whisper_generation_config.hpp index 37b23cde74..44d611923d 100644 --- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -3,8 +3,8 @@ #pragma once -#include #include +#include #include "openvino/genai/tokenizer.hpp" #include "openvino/runtime/compiled_model.hpp" @@ -46,6 +46,9 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig { // Transcribe token id. int64_t transcribe_token_id = 50359; + // Corresponds to the ”<|startofprev|>” token. + int64_t prev_sot_token_id = 50361; + // No timestamps token id. int64_t no_timestamps_token_id = 50363; @@ -75,6 +78,32 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig { // Note that a segment of text refers to a sequence of one or more words, rather than individual words. bool return_timestamps = false; + /* + * Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing + * window. Can be used to steer the model to use particular spellings or styles. + * + * Example: + * auto result = pipeline.generate(raw_speech); + * // He has gone and gone for good answered Paul Icrom who... + * + * auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome")); + * // He has gone and gone for good answered Polychrome who... + */ + std::optional initial_prompt = std::nullopt; + + /* + * Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows. + * Can be used to steer the model to use particular spellings or styles. + * + * Example: + * auto result = pipeline.generate(raw_speech); + * // He has gone and gone for good answered Paul Icrom who... + * + * auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); + * // He has gone and gone for good answered Polychrome who... 
+ */ + std::optional hotwords = std::nullopt; + // A list containing tokens that will be suppressed at the beginning of the sampling process. std::vector begin_suppress_tokens; @@ -111,9 +140,12 @@ static constexpr ov::Property pad_token_id{"pad_token_id"}; static constexpr ov::Property transcribe_token_id{"transcribe_token_id"}; static constexpr ov::Property translate_token_id{"translate_token_id"}; static constexpr ov::Property no_timestamps_token_id{"no_timestamps_token_id"}; +static constexpr ov::Property prev_sot_token_id{"prev_sot_token_id"}; static constexpr ov::Property language{"language"}; static constexpr ov::Property task{"task"}; static constexpr ov::Property return_timestamps{"return_timestamps"}; +static constexpr ov::Property initial_prompt{"initial_prompt"}; +static constexpr ov::Property hotwords{"hotwords"}; static constexpr ov::Property> lang_to_id{"lang_to_id"}; } // namespace genai diff --git a/src/cpp/src/whisper/context_tokens.cpp b/src/cpp/src/whisper/context_tokens.cpp new file mode 100644 index 0000000000..75ee442551 --- /dev/null +++ b/src/cpp/src/whisper/context_tokens.cpp @@ -0,0 +1,89 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "context_tokens.hpp" + +namespace { +std::pair, float> tokenize(std::string&& text, + const ov::genai::WhisperGenerationConfig& config, + ov::genai::Tokenizer& tokenizer) { + if (text.empty()) { + return {{}, 0.0f}; + } + + auto start_time = std::chrono::steady_clock::now(); + auto encoded = tokenizer.encode(text, ov::genai::add_special_tokens(false)); + auto duration = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - start_time); + + auto input_ids = encoded.input_ids; + auto input_ids_data = input_ids.data(); + + std::vector prompt_tokens; + prompt_tokens.reserve(input_ids.get_size()); + + // even with ov::genai::add_special_tokens(false) tokenizer adds next special tokens. 
Ticket: 159569 + std::set special_tokens{config.decoder_start_token_id, config.eos_token_id, config.no_timestamps_token_id}; + + for (size_t i = 0; i < input_ids.get_size(); i++) { + if (special_tokens.count(input_ids_data[i])) { + continue; + } + + prompt_tokens.emplace_back(input_ids_data[i]); + } + + return {prompt_tokens, duration}; +} +} // namespace + +namespace ov { +namespace genai { + +std::pair prepare_context_tokens(const WhisperGenerationConfig& config, + Tokenizer& tokenizer) { + WhisperContextTokens context_tokens; + float duration = 0.0f; + + if (config.initial_prompt.has_value()) { + auto [initial_prompt_tokens, initial_prompt_duration] = + tokenize(" " + *config.initial_prompt, config, tokenizer); + context_tokens.initial_prompt = std::move(initial_prompt_tokens); + duration += initial_prompt_duration; + } + + if (config.hotwords.has_value()) { + auto [hotwords_tokens, hotwords_duration] = tokenize(" " + *config.hotwords, config, tokenizer); + context_tokens.hotwords = std::move(hotwords_tokens); + duration += hotwords_duration; + } + + return {context_tokens, duration}; +} + +std::vector get_prompt_tokens(const WhisperContextTokens& context_tokens, + const WhisperGenerationConfig& config, + size_t chunk_offset) { + bool should_add_initial_prompt = !context_tokens.initial_prompt.empty() && chunk_offset == 0; + bool should_add_hotwords = !context_tokens.hotwords.empty(); + + if (!should_add_initial_prompt && !should_add_hotwords) { + return {}; + } + + std::vector prompt_tokens{config.prev_sot_token_id}; + + if (should_add_initial_prompt) { + prompt_tokens.insert(prompt_tokens.end(), + context_tokens.initial_prompt.begin(), + context_tokens.initial_prompt.end()); + } + + if (should_add_hotwords) { + prompt_tokens.insert(prompt_tokens.end(), context_tokens.hotwords.begin(), context_tokens.hotwords.end()); + } + + return prompt_tokens; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/context_tokens.hpp 
b/src/cpp/src/whisper/context_tokens.hpp new file mode 100644 index 0000000000..0042ba8136 --- /dev/null +++ b/src/cpp/src/whisper/context_tokens.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/perf_metrics.hpp" +#include "openvino/genai/whisper_generation_config.hpp" + +namespace ov { +namespace genai { + +struct WhisperContextTokens { + std::vector initial_prompt; + std::vector hotwords; +}; + +std::pair prepare_context_tokens(const WhisperGenerationConfig& config, + Tokenizer& tokenizer); + +std::vector get_prompt_tokens(const WhisperContextTokens& context_tokens, + const WhisperGenerationConfig& config, + size_t chunk_offset); + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 355ccc619b..9d6aa698ce 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -8,6 +8,7 @@ #include #include +#include "context_tokens.hpp" #include "logit_processor.hpp" #include "openvino/genai/perf_metrics.hpp" #include "openvino/genai/whisper_generation_config.hpp" @@ -175,11 +176,11 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state, return output_token; } -std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, - ov::InferRequest decoder, - const ov::genai::WhisperGenerationConfig& config, - const bool return_timestamps, - ov::genai::RawPerfMetrics& raw_metrics) { +std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, + ov::InferRequest decoder, + const ov::genai::WhisperGenerationConfig& config, + const bool return_timestamps, + ov::genai::RawPerfMetrics& raw_metrics) { if (!config.is_multilingual) { if (return_timestamps) { return std::vector{config.decoder_start_token_id}; @@ -290,6 +291,7 @@ namespace genai { WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config, const ov::genai::WhisperConfig& model_config, + 
const WhisperContextTokens& context_tokens, const RawSpeechInput& raw_speech, ov::genai::WhisperInitializedModels& models, WhisperFeatureExtractor& feature_extractor, @@ -313,7 +315,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& // long-form audio processing requires timestamps to be enabled const bool return_timestamps = config.return_timestamps || !is_shortform; - std::vector init_ids; + std::vector init_tokens; std::vector& output_tokens = result.output_tokens; std::vector segments; @@ -335,14 +337,18 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& raw_metrics); // prepare init_ids just once for whole input - if (init_ids.empty()) { - init_ids = prepare_init_ids(hidden_state_tensor, models.decoder, config, return_timestamps, raw_metrics); + if (init_tokens.empty()) { + init_tokens = + prepare_init_tokens(hidden_state_tensor, models.decoder, config, return_timestamps, raw_metrics); } + std::vector chunk_init_tokens = ov::genai::get_prompt_tokens(context_tokens, config, chunk_offset); + chunk_init_tokens.insert(chunk_init_tokens.end(), init_tokens.begin(), init_tokens.end()); + auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor, config, models, - init_ids, + chunk_init_tokens, max_new_tokens - output_tokens.size(), return_timestamps, raw_metrics, diff --git a/src/cpp/src/whisper/whisper.hpp b/src/cpp/src/whisper/whisper.hpp index 4904edf925..81f559db9f 100644 --- a/src/cpp/src/whisper/whisper.hpp +++ b/src/cpp/src/whisper/whisper.hpp @@ -5,6 +5,7 @@ #include +#include "context_tokens.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" #include "whisper_config.hpp" @@ -28,6 +29,7 @@ struct WhisperGenerateResult { WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config, const ov::genai::WhisperConfig& model_config, + const WhisperContextTokens& context_tokens, const 
ov::genai::RawSpeechInput& raw_speech, ov::genai::WhisperInitializedModels& models, ov::genai::WhisperFeatureExtractor& feature_extractor, diff --git a/src/cpp/src/whisper_generation_config.cpp b/src/cpp/src/whisper_generation_config.cpp index 0fba4e962f..beb663caaf 100644 --- a/src/cpp/src/whisper_generation_config.cpp +++ b/src/cpp/src/whisper_generation_config.cpp @@ -8,8 +8,8 @@ #include #include -#include "utils.hpp" #include "json_utils.hpp" +#include "utils.hpp" namespace ov { namespace genai { @@ -31,6 +31,7 @@ WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& js read_json_param(data, "pad_token_id", pad_token_id); read_json_param(data, "no_timestamps_token_id", no_timestamps_token_id); read_json_param(data, "max_initial_timestamp_index", max_initial_timestamp_index); + read_json_param(data, "prev_sot_token_id", prev_sot_token_id); read_json_param(data, "is_multilingual", is_multilingual); if (is_multilingual) { @@ -73,6 +74,8 @@ void WhisperGenerationConfig::update_generation_config(const ov::AnyMap& config_ read_anymap_param(config_map, "lang_to_id", lang_to_id); read_anymap_param(config_map, "task", task); read_anymap_param(config_map, "return_timestamps", return_timestamps); + read_anymap_param(config_map, "initial_prompt", initial_prompt); + read_anymap_param(config_map, "hotwords", hotwords); } size_t WhisperGenerationConfig::get_max_new_tokens(size_t prompt_length) const { diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index d472a20238..f0fb34cdf6 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ b/src/cpp/src/whisper_pipeline.cpp @@ -9,6 +9,7 @@ #include #include "utils.hpp" +#include "whisper/context_tokens.hpp" #include "whisper/streamer.hpp" #include "whisper/whisper.hpp" #include "whisper/whisper_config.hpp" @@ -91,8 +92,11 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi streamer_ptr = std::make_shared(m_tokenizer, *callback); } + auto 
[context_tokens, tokenization_duration_microseconds] = prepare_context_tokens(config, m_tokenizer); + auto generate_result = ov::genai::whisper_generate(config, m_model_config, + context_tokens, raw_speech_input, m_models, m_feature_extractor, @@ -102,6 +106,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi generate_result.perf_metrics.raw_metrics.detokenization_durations.emplace_back( PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time)); + result.perf_metrics.raw_metrics.tokenization_durations.emplace_back(tokenization_duration_microseconds); + result.perf_metrics = generate_result.perf_metrics; auto& segments = generate_result.segments; diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp index 136819fa01..dc26789846 100644 --- a/src/cpp/src/whisper_pipeline_static.cpp +++ b/src/cpp/src/whisper_pipeline_static.cpp @@ -579,6 +579,9 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate( WhisperGenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; config.validate(); + OPENVINO_ASSERT(!config.initial_prompt.has_value(), "'initial_prompt' parameter is not supported on NPU device."); + OPENVINO_ASSERT(!config.hotwords.has_value(), "'hotwords' parameter is not supported on NPU device."); + std::shared_ptr streamer_ptr; if (auto streamer_obj = std::get_if(&streamer)) { streamer_ptr = nullptr; diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index bfcb869157..3d27b23052 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -1948,6 +1948,9 @@ class WhisperGenerationConfig: :param no_timestamps_token_id: No timestamps token id. :type no_timestamps_token_id: int + :param prev_sot_token_id: Corresponds to the ”<|startofprev|>” token. 
+ :type prev_sot_token_id: int + :param is_multilingual: :type is_multilingual: bool @@ -1976,10 +1979,34 @@ class WhisperGenerationConfig: then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. Note that a segment of text refers to a sequence of one or more words, rather than individual words. :type return_timestamps: bool + + :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing + window. Can be used to steer the model to use particular spellings or styles. + + Example: + auto result = pipeline.generate(raw_speech); + // He has gone and gone for good answered Paul Icrom who... + + auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome")); + // He has gone and gone for good answered Polychrome who... + :type initial_prompt: Optional[str] + + :param hotwords: Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows. + Can be used to steer the model to use particular spellings or styles. + + Example: + auto result = pipeline.generate(raw_speech); + // He has gone and gone for good answered Paul Icrom who... + + auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); + // He has gone and gone for good answered Polychrome who... + :type hotwords: Optional[str] """ begin_suppress_tokens: list[int] decoder_start_token_id: int eos_token_id: int + hotwords: str | None + initial_prompt: str | None is_multilingual: bool lang_to_id: dict[str, int] language: str | None @@ -1988,6 +2015,7 @@ class WhisperGenerationConfig: max_new_tokens: int no_timestamps_token_id: int pad_token_id: int + prev_sot_token_id: int return_timestamps: bool suppress_tokens: list[int] task: str | None @@ -2080,6 +2108,9 @@ class WhisperPipeline: :param no_timestamps_token_id: No timestamps token id. 
:type no_timestamps_token_id: int + :param prev_sot_token_id: Corresponds to the ”<|startofprev|>” token. + :type prev_sot_token_id: int + :param is_multilingual: :type is_multilingual: bool @@ -2108,6 +2139,28 @@ class WhisperPipeline: then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. Note that a segment of text refers to a sequence of one or more words, rather than individual words. :type return_timestamps: bool + + :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing + window. Can be used to steer the model to use particular spellings or styles. + + Example: + auto result = pipeline.generate(raw_speech); + // He has gone and gone for good answered Paul Icrom who... + + auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome")); + // He has gone and gone for good answered Polychrome who... + :type initial_prompt: Optional[str] + + :param hotwords: Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows. + Can be used to steer the model to use particular spellings or styles. + + Example: + auto result = pipeline.generate(raw_speech); + // He has gone and gone for good answered Paul Icrom who... + + auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); + // He has gone and gone for good answered Polychrome who... + :type hotwords: Optional[str] """ def get_generation_config(self) -> WhisperGenerationConfig: ... diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index 49152c03f4..cd42dcf58d 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -103,6 +103,9 @@ auto whisper_generation_config_docstring = R"( :param no_timestamps_token_id: No timestamps token id. 
:type no_timestamps_token_id: int + :param prev_sot_token_id: Corresponds to the ”<|startofprev|>” token. + :type prev_sot_token_id: int + :param is_multilingual: :type is_multilingual: bool @@ -131,6 +134,28 @@ auto whisper_generation_config_docstring = R"( then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. Note that a segment of text refers to a sequence of one or more words, rather than individual words. :type return_timestamps: bool + + :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing + window. Can be used to steer the model to use particular spellings or styles. + + Example: + auto result = pipeline.generate(raw_speech); + // He has gone and gone for good answered Paul Icrom who... + + auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome")); + // He has gone and gone for good answered Polychrome who... + :type initial_prompt: Optional[str] + + :param hotwords: Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows. + Can be used to steer the model to use particular spellings or styles. + + Example: + auto result = pipeline.generate(raw_speech); + // He has gone and gone for good answered Paul Icrom who... + + auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); + // He has gone and gone for good answered Polychrome who... 
+ :type hotwords: Optional[str] )"; auto streamer_base_docstring = R"( @@ -262,11 +287,14 @@ void init_whisper_pipeline(py::module_& m) { .def_readwrite("transcribe_token_id", &WhisperGenerationConfig::transcribe_token_id) .def_readwrite("max_initial_timestamp_index", &WhisperGenerationConfig::max_initial_timestamp_index) .def_readwrite("no_timestamps_token_id", &WhisperGenerationConfig::no_timestamps_token_id) + .def_readwrite("prev_sot_token_id", &WhisperGenerationConfig::prev_sot_token_id) .def_readwrite("is_multilingual", &WhisperGenerationConfig::is_multilingual) .def_readwrite("language", &WhisperGenerationConfig::language) .def_readwrite("lang_to_id", &WhisperGenerationConfig::lang_to_id) .def_readwrite("task", &WhisperGenerationConfig::task) .def_readwrite("return_timestamps", &WhisperGenerationConfig::return_timestamps) + .def_readwrite("initial_prompt", &WhisperGenerationConfig::initial_prompt) + .def_readwrite("hotwords", &WhisperGenerationConfig::hotwords) .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")); py::class_(m, "WhisperRawPerfMetrics", raw_perf_metrics_docstring) diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py index 9a117bc939..1450ef1f2e 100644 --- a/tests/python_tests/test_whisper_generate_api.py +++ b/tests/python_tests/test_whisper_generate_api.py @@ -570,6 +570,31 @@ def test_longform_audio(model_descr, test_sample): assert genai_result.chunks == None +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize( + "test_sample", + get_samples_from_dataset(length=1), +) +@pytest.mark.precommit +def test_initial_prompt_hotwords(model_descr, test_sample): + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + + result = pipe.generate(test_sample) + + assert "Joel Keaton" in result.texts[0] + assert "Joel Kyton" not in result.texts[0] + + result = 
pipe.generate(test_sample, initial_prompt="Joel Kyton") + + assert "Joel Keaton" not in result.texts[0] + assert "Joel Kyton" in result.texts[0] + + result = pipe.generate(test_sample, hotwords="Joel Kyton") + + assert "Joel Keaton" not in result.texts[0] + assert "Joel Kyton" in result.texts[0] + + @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( "test_sample", From c13e8e5a2effdb7834a40a10586dfdd39e72bd2a Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 19:34:57 +0400 Subject: [PATCH 7/9] [ SD ] Fix of scheduler config for main_pipeline (#1406) --- .../speculative_decoding_impl.cpp | 17 +++++++++-------- .../utils/paged_attention_transformations.cpp | 2 +- .../utils/paged_attention_transformations.hpp | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index 4a0748b5c0..46b7b106a6 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -46,14 +46,15 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con draft_scheduler_config = is_scheduler_undefined ? 
main_scheduler_config : draft_model_desc.scheduler_config; if (is_scheduler_undefined) { // split KV cache to 2 caches for main and draft models - size_t main_model_cache_size = utils::get_kv_cache_size(main_model), - draft_model_cache_size = utils::get_kv_cache_size(draft_model); - auto k = static_cast(draft_model_cache_size) / (main_model_cache_size + draft_model_cache_size); + size_t main_model_hidden_size = utils::get_hidden_size(main_model), + draft_model_hidden_size = utils::get_hidden_size(draft_model); + auto k = static_cast(draft_model_hidden_size) / (main_model_hidden_size + draft_model_hidden_size); - size_t main_cache_size = main_scheduler_config.cache_size * (1 - k), + size_t main_cache_size = std::ceil(main_scheduler_config.cache_size * (1.f - k)), draft_cache_size = main_scheduler_config.cache_size - main_cache_size; + OPENVINO_ASSERT(main_cache_size > 0, "KV cache model cache size should be > 0"); if (draft_cache_size == 0) { - main_cache_size -= main_cache_size > 1 ? 1 : 0; + main_cache_size -= (main_cache_size > 1 ? 1 : 0); draft_cache_size = 1; } @@ -63,7 +64,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con ov::AnyMap draft_properties = draft_model_desc.properties == ov::AnyMap{} ? 
compile_properties : draft_model_desc.properties; - DeviceConfig main_device_config(core, main_scheduler_config, main_device, compile_properties), + DeviceConfig main_device_config(core, main_scheduler_config_updated, main_device, compile_properties), draft_device_config(core, draft_scheduler_config, draft_device, draft_properties); utils::set_kv_cache_type_and_shape(main_model, main_device_config); @@ -82,7 +83,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode m_main_pipeline = std::make_shared(core, main_model, main_model_tokenizer, main_model_desc.generation_config, - main_device_config, main_scheduler_config, main_device, compile_properties, true); + main_device_config, main_scheduler_config_updated, main_device, compile_properties, true); m_draft_pipeline = std::make_shared(core, draft_model, draft_model_tokenizer, draft_model_desc.generation_config, draft_device_config, draft_scheduler_config, draft_device, draft_properties, false); @@ -278,4 +279,4 @@ SpeculativeDecodingMetrics ContinuousBatchingPipeline::SpeculativeDecodingImpl::get_speculative_decoding_metrics() { return m_sd_metrics; }; -} \ No newline at end of file +} diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp index 53690f770c..16c9556151 100644 --- a/src/cpp/src/utils/paged_attention_transformations.cpp +++ b/src/cpp/src/utils/paged_attention_transformations.cpp @@ -16,7 +16,7 @@ inline ov::PartialShape to_partial_with_dyn_0_dim(const ov::Shape& static_shape) return partial_shape; } -size_t get_kv_cache_size(const std::shared_ptr model) { +size_t get_hidden_size(const std::shared_ptr model) { const auto& parameters = model->get_parameters(); // extract num_kv_heads and head_size size_t kv_caches_inputs_offset = 2; diff --git a/src/cpp/src/utils/paged_attention_transformations.hpp 
b/src/cpp/src/utils/paged_attention_transformations.hpp index 3bc423d7bc..88ac0876c5 100644 --- a/src/cpp/src/utils/paged_attention_transformations.hpp +++ b/src/cpp/src/utils/paged_attention_transformations.hpp @@ -23,7 +23,7 @@ void apply_paged_attention_transformations(std::shared_ptr model, Dev void apply_paged_attention_transformations(std::shared_ptr model, bool per_layer_cache_control = false); -size_t get_kv_cache_size(const std::shared_ptr model); +size_t get_hidden_size(const std::shared_ptr model); void set_kv_cache_type_and_shape(std::shared_ptr model, DeviceConfig& device_config); From 19c66f5d3c316f0d54b1e4f2594d72b3a4add018 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 19 Dec 2024 18:06:27 +0100 Subject: [PATCH 8/9] Fail gracefully when openvino_tokenizer.xml is not available (#1413) It was failing with segfault. Now fails more gracefully ``` Check 'm_ireq_queue_tokenizer' failed at .../src/cpp/src/tokenizer.cpp:387: Either openvino_tokenizer.xml was not provided or it was not loaded correctly. Tokenizer::encode is not available ``` CVS-158884 --- src/cpp/src/tokenizer.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 642236d32a..ed6fbc0a06 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -194,10 +194,16 @@ class Tokenizer::TokenizerImpl { void setupTokenizer(const std::pair, std::shared_ptr>& models, const ov::AnyMap& properties) { auto [ov_tokenizer, ov_detokenizer] = models; + OPENVINO_ASSERT(ov_tokenizer || ov_detokenizer, "Neither tokenizer nor detokenzier models were provided"); - m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1; auto core = get_core_singleton(); std::string device = "CPU"; // only CPU is supported for now + + std::string version_str; + utils::read_rt_info(ov_tokenizer != nullptr ? 
ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str); + // Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5 + m_older_than_24_5 = version_str.empty(); + if (ov_tokenizer) { ov::pass::Manager manager; manager.register_pass(); @@ -230,7 +236,8 @@ class Tokenizer::TokenizerImpl { if (m_tokenizer) { // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup. encode("non empty string").input_ids; - if (m_detokenizer) + } + if (m_detokenizer) { decode({1, 33, 199, 42, 42}); } @@ -377,6 +384,9 @@ class Tokenizer::TokenizerImpl { } TokenizedInputs encode(std::string prompt, const ov::AnyMap& tokenization_params = {}) { + OPENVINO_ASSERT(m_ireq_queue_tokenizer, "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. " + "Tokenizer::encode is not available"); + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_tokenizer.get()); set_state_if_necessary(infer_request_guard, tokenization_params); size_t batch_size = 1; @@ -390,6 +400,8 @@ class Tokenizer::TokenizerImpl { } TokenizedInputs encode(std::vector& prompts, const ov::AnyMap& tokenization_params = {}) { + OPENVINO_ASSERT(m_ireq_queue_tokenizer, "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. 
" + "Tokenizer::encode is not available"); TokenizedInputs unpadded; { CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_tokenizer.get()); From 4d18f8b264c79ddce3c2dc0997992c26ab5c6c5f Mon Sep 17 00:00:00 2001 From: Sofya Balandina Date: Fri, 20 Dec 2024 08:03:49 +0000 Subject: [PATCH 9/9] Make Sampler a member of the class for llm/vlm pipelines (#1412) cherry-pick https://github.com/openvinotoolkit/openvino.genai/pull/1347 to master --- src/cpp/src/llm_pipeline.cpp | 12 +++++++++--- src/cpp/src/lm_encoding.cpp | 3 +++ src/cpp/src/sampler.hpp | 7 ++++++- src/cpp/src/visual_language/pipeline.cpp | 14 ++++++++++++-- tests/python_tests/test_chat_generate_api.py | 7 +++++-- 5 files changed, 35 insertions(+), 8 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 6d9aae30fa..6fdb8ac1cd 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -45,6 +45,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF; size_t m_to_remove_from_hist = 0; size_t m_kv_cache_seq_length_axis = 2; + Sampler m_sampler; StatefulLLMPipeline( const ov::InferRequest& request, @@ -75,7 +76,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { const std::string& device, const ov::AnyMap& config, const ov::genai::GenerationConfig& generation_config - ) : LLMPipelineImplBase(tokenizer, generation_config) { + ) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) { ov::Core core; ov::CompiledModel compiled_model; auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config); @@ -96,6 +97,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); + + 
m_sampler.set_seed(m_generation_config.rng_seed); } StatefulLLMPipeline( @@ -358,9 +361,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { requests.push_back(sequence_group); } - Sampler sampler = Sampler(m_tokenizer); + if (m_sampler.get_seed() != config.rng_seed) { + m_sampler.set_seed(config.rng_seed); + } + std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr, - sampler, requests, position_ids, std::nullopt, m_selected_beam); + m_sampler, requests, position_ids, std::nullopt, m_selected_beam); } if (is_chat_conversation) { diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index 3ab041fa58..62c53cace4 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -247,6 +247,9 @@ std::pair get_lm_encoded_results( // next_selected_beam = sampler.last_selected_beam(request); } + for (SequenceGroup::Ptr sequence_group : sequence_groups) + sampler.clear_request_info(sequence_group->get_request_id()); + return {results, next_selected_beam}; } diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 0f7876cbf9..08a9863e0a 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -55,6 +55,7 @@ class Sampler { std::map m_beam_search_info; std::mt19937 rng_engine; + size_t seed = rng_engine.default_seed; // { request_id, logit_processor } std::map m_logit_processors; @@ -65,7 +66,11 @@ class Sampler { Sampler(Tokenizer & tokenizer) : m_tokenizer(tokenizer) {}; SamplerOutput sample(std::vector & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false); - void set_seed(size_t seed) { rng_engine.seed(seed); } + void set_seed(size_t new_seed) { + rng_engine.seed(new_seed); + seed = new_seed; + } + size_t get_seed() { return seed; } void clear_request_info(uint64_t request_id); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 
0d7aebc506..7bf1c1070a 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -67,6 +67,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { float m_load_time_ms = 0; // Axis num in kv cache from m_language model, which contains information about history len size_t m_kv_cache_seq_length_axis = 2; + // Component for applying sampling to lm outputs + Sampler m_sampler; VLMPipelineImpl( const std::filesystem::path& models_dir, @@ -105,6 +107,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { if (m_generation_config.eos_token_id == -1) { m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); } + + m_sampler = Sampler(m_tokenizer); + m_sampler.set_seed(m_generation_config.rng_seed); } VLMPipelineImpl( @@ -140,6 +145,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { if (m_generation_config.eos_token_id == -1) { m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); } + + m_sampler = Sampler(m_tokenizer); + m_sampler.set_seed(m_generation_config.rng_seed); } VLMDecodedResults generate( @@ -204,11 +212,13 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }}; std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), history_size); - Sampler sampler = Sampler(m_tokenizer); + if (m_sampler.get_seed() != generation_config.rng_seed) { + m_sampler.set_seed(generation_config.rng_seed); + } ov::genai::EncodedResults encoded_result; int32_t m_selected_beam = 0; - std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler, requests, + std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests, position_ids, m_embedding, std::nullopt); auto decode_start_time = std::chrono::steady_clock::now(); diff --git 
a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index 9260e671d6..d9661e538b 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -187,10 +187,13 @@ def test_set_chat_template(): model_descr = get_chat_models_list()[0] model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) pipe.get_tokenizer().set_chat_template("{% for message in messages %}{{ message['content'] }}{% endfor %}") + config = ov_genai.GenerationConfig() + config.max_new_tokens = 1 + config.do_sample = False pipe.start_chat() - generated = pipe.generate("a", max_new_tokens=1) + generated = pipe.generate("a", config) pipe.finish_chat() - reference = pipe.generate("a", max_new_tokens=1) + reference = pipe.generate("a", config) assert generated == reference prompts = [