From f598639fa23530457aece2d0f9a0527f82be0c3f Mon Sep 17 00:00:00 2001
From: tongqiu
Date: Tue, 17 Dec 2024 22:52:43 +0800
Subject: [PATCH 1/6] Add workaround for MSVC mutex constructor issue (#1367)

This issue is an MSVC compiler bug affecting certain versions of Visual Studio 2022. When using `std::mutex` a null dereference may occur, leading to a silent crash in Release mode, as illustrated in the image below.

![mutex](https://github.com/user-attachments/assets/07331f59-7e6d-47b4-a72a-887e01817fa8)

Adding the compiler option `/D"_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR"` serves as a workaround for this problem.

Reference:
https://hydrogenaud.io/index.php/topic,126070.0.html
https://github.com/microsoft/STL/wiki/Changelog#vs-2022-1710
---
 CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 35ca895abc..fec8df34af 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,13 @@ if(WIN32 OR APPLE)
     set(CMAKE_DEBUG_POSTFIX "d")
 endif()
+# Workaround for an MSVC compiler issue in some versions of Visual Studio 2022.
+# The issue involves a null dereference to a mutex. For details, refer to link https://github.com/microsoft/STL/wiki/Changelog#vs-2022-1710
+if(MSVC AND MSVC_VERSION GREATER_EQUAL 1930 AND MSVC_VERSION LESS 1941)
+    add_compile_definitions(_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)
+endif()
+
+
 add_subdirectory(thirdparty)
 add_subdirectory(src)
 if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/samples")

From c6af2f12d7e85f14c5b8260f43d03a7a32508ddc Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 17 Dec 2024 18:53:13 +0400
Subject: [PATCH 2/6] [Image to image] PNDM support (#1394)

Continuation for https://github.com/openvinotoolkit/openvino.genai/pull/1393

CVS-158967
---
 samples/cpp/image_generation/README.md           | 4 ++++
 samples/cpp/image_generation/inpainting.bmp      | 3 +++
 samples/python/image_generation/README.md        | 4 ++++
 src/cpp/src/image_generation/schedulers/pndm.cpp | 8 ++++++++
 src/python/openvino_genai/py_openvino_genai.pyi  | 5 ++++-
 src/python/py_image_generation_pipelines.cpp     | 3 ++-
 tools/llm_bench/llm_bench_utils/ov_utils.py      | 3 ++-
 7 files changed, 27 insertions(+), 3 deletions(-)
 create mode 100644 samples/cpp/image_generation/inpainting.bmp

diff --git a/samples/cpp/image_generation/README.md b/samples/cpp/image_generation/README.md
index f8dc21cc39..3dcb64b97c 100644
--- a/samples/cpp/image_generation/README.md
+++ b/samples/cpp/image_generation/README.md
@@ -143,4 +143,8 @@ And run the sample:

 `./inpainting ./stable-diffusion-2-inpainting 'Face of a yellow cat, high resolution, sitting on a park bench' image.png mask_image.png`

+The resulting image is:
+
+   ![](./inpainting.bmp)
+
 Note, that LoRA, heterogeneous execution and other features of `Text2ImagePipeline` are applicable for `InpaintingPipeline`.
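For context, the inpainting sample that this README section describes boils down to a few `InpaintingPipeline` calls. Below is a minimal C++ sketch, not the full sample: it assumes the samples' `utils::load_image` and `imwrite` helpers for image I/O, and uses the `generate(prompt, image, mask_image)` overload declared in `inpainting_pipeline.hpp` later in this series.

```cpp
// Minimal sketch of the inpainting flow (not the full sample).
// load_image()/imwrite() are assumed to be the helper utilities shipped with
// the samples; argument checking and error handling are omitted.
#include "openvino/genai/image_generation/inpainting_pipeline.hpp"
#include "imwrite.hpp"
#include "load_image.hpp"

int main(int argc, char* argv[]) {
    const std::string models_path = argv[1], prompt = argv[2];
    const std::string device = "CPU";  // GPU can also be used

    // [1, height, width, 3] u8 tensors, as expected by generate()
    const ov::Tensor image = utils::load_image(argv[3]);
    const ov::Tensor mask_image = utils::load_image(argv[4]);

    ov::genai::InpaintingPipeline pipe(models_path, device);
    const ov::Tensor generated_image = pipe.generate(prompt, image, mask_image);

    // Writes one BMP file per generated image in the batch
    imwrite("image_%d.bmp", generated_image, true);
    return 0;
}
```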
diff --git a/samples/cpp/image_generation/inpainting.bmp b/samples/cpp/image_generation/inpainting.bmp new file mode 100644 index 0000000000..b93292e075 --- /dev/null +++ b/samples/cpp/image_generation/inpainting.bmp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527cee8f7d451c7e5004bc58c079d4c853443644eaeb2d84a343016cd25214c1 +size 786486 diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md index 3e53f40fc4..13b4ea6ee0 100644 --- a/samples/python/image_generation/README.md +++ b/samples/python/image_generation/README.md @@ -142,4 +142,8 @@ And run the sample: `python inpainting.py ./stable-diffusion-2-inpainting 'Face of a yellow cat, high resolution, sitting on a park bench' image.png mask_image.png` +The resuling image is: + + ![](./../../cpp/image_generation/inpainting.bmp) + Note, that LoRA, heterogeneous execution and other features of `Text2ImagePipeline` are applicable for `InpaintingPipeline`. diff --git a/src/cpp/src/image_generation/schedulers/pndm.cpp b/src/cpp/src/image_generation/schedulers/pndm.cpp index a760283b97..4ddc099d0e 100644 --- a/src/cpp/src/image_generation/schedulers/pndm.cpp +++ b/src/cpp/src/image_generation/schedulers/pndm.cpp @@ -132,6 +132,14 @@ void PNDMScheduler::set_timesteps(size_t num_inference_steps, float strength) { m_ets = {}; m_counter = 0; m_cur_sample = ov::Tensor(ov::element::f32, {}); + + // apply 'strength' used in image generation + // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L711 + { + size_t init_timestep = std::min(num_inference_steps * strength, num_inference_steps); + size_t t_start = std::max(num_inference_steps - init_timestep, 0); + m_timesteps = std::vector(m_timesteps.begin() + t_start, m_timesteps.end()); + } } std::map PNDMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) { diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 8b8eb76b12..6135a187eb 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -1338,6 +1338,8 @@ class Scheduler: EULER_DISCRETE FLOW_MATCH_EULER_DISCRETE + + PNDM """ AUTO: typing.ClassVar[Scheduler.Type] # value = DDIM: typing.ClassVar[Scheduler.Type] # value = @@ -1345,7 +1347,8 @@ class Scheduler: FLOW_MATCH_EULER_DISCRETE: typing.ClassVar[Scheduler.Type] # value = LCM: typing.ClassVar[Scheduler.Type] # value = LMS_DISCRETE: typing.ClassVar[Scheduler.Type] # value = - __members__: typing.ClassVar[dict[str, Scheduler.Type]] # value = {'AUTO': , 'LCM': , 'LMS_DISCRETE': , 'DDIM': , 'EULER_DISCRETE': , 'FLOW_MATCH_EULER_DISCRETE': } + PNDM: typing.ClassVar[Scheduler.Type] # value = + __members__: typing.ClassVar[dict[str, Scheduler.Type]] # value = {'AUTO': , 'LCM': , 'LMS_DISCRETE': , 'DDIM': , 'EULER_DISCRETE': , 'FLOW_MATCH_EULER_DISCRETE': , 'PNDM': } def __eq__(self, other: typing.Any) -> bool: ... 
def __getstate__(self) -> int: diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index da6ce6d21b..f5347c279d 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -197,7 +197,8 @@ void init_image_generation_pipelines(py::module_& m) { .value("LMS_DISCRETE", ov::genai::Scheduler::Type::LMS_DISCRETE) .value("DDIM", ov::genai::Scheduler::Type::DDIM) .value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE) - .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE); + .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE) + .value("PNDM", ov::genai::Scheduler::Type::PNDM); image_generation_scheduler.def_static("from_config", &ov::genai::Scheduler::from_config, py::arg("scheduler_config_path"), diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py index 427f1c84f3..c3df84925b 100644 --- a/tools/llm_bench/llm_bench_utils/ov_utils.py +++ b/tools/llm_bench/llm_bench_utils/ov_utils.py @@ -420,7 +420,8 @@ def get_vae_decoder_step_count(self): start = time.perf_counter() scheduler_type = data.get("scheduler", ["", ""])[1] - if (scheduler_type not in ["LCMScheduler", "DDIMScheduler", "LMSDiscreteScheduler", "EulerDiscreteScheduler", "FlowMatchEulerDiscreteScheduler"]): + if (scheduler_type not in ["LCMScheduler", "DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler", "EulerDiscreteScheduler", + "FlowMatchEulerDiscreteScheduler"]): scheduler = openvino_genai.Scheduler.from_config(model_path / "scheduler/scheduler_config.json", openvino_genai.Scheduler.Type.DDIM) log.warning(f'Type of scheduler {scheduler_type} is unsupported. Please, be aware that it will be replaced to DDIMScheduler') From 79f64a6541558a66e7d55b36990b26dbcf5ebf4b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 17 Dec 2024 18:53:50 +0400 Subject: [PATCH 3/6] [Inpainting] Added single channel mask support (#1398) Current PR brings a single channel masks support (both GRAY and BINARY; GRAY is converted in BINARY anyway within mask image processor) Based on passed mask type, we dynamically select proper mask processor and convert all mask images types to BINARY. CVS-159222 --- .../image_generation/image2image_pipeline.hpp | 9 ++++++++- .../image_generation/inpainting_pipeline.hpp | 9 ++++++++- .../image_generation/text2image_pipeline.hpp | 2 +- src/cpp/src/image_generation/image_processor.cpp | 16 ++++++---------- src/cpp/src/image_generation/image_processor.hpp | 4 ++-- .../stable_diffusion_pipeline.hpp | 14 ++++++++------ 6 files changed, 33 insertions(+), 21 deletions(-) diff --git a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp index ea02969c5e..c6c1f59c88 100644 --- a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp @@ -67,7 +67,14 @@ class OPENVINO_GENAI_EXPORTS Image2ImagePipeline { return compile(device, ov::AnyMap{std::forward(properties)...}); } - // Returns a tensor with the following dimensions [num_images_per_prompt, height, width, 3] + /** + * Peforms initial image editing conditioned on a text prompt. 
+ * @param positive_prompt Prompt to generate image(s) from + * @param initial_image RGB/BGR image of [1, height, width, 3] shape used to initialize latent image + * @param properties Image generation parameters specified as properties. Values in 'properties' override default value for generation parameters. + * @returns A tensor which has dimensions [num_images_per_prompt, height, width, 3] + * @note Output image size is the same as initial image size, but rounded down to be divisible by VAE scale factor (usually, 8) + */ ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, const ov::AnyMap& properties = {}); template diff --git a/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp index 6eead673e4..03dd9468f7 100644 --- a/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp @@ -89,7 +89,14 @@ class OPENVINO_GENAI_EXPORTS InpaintingPipeline { return compile(device, ov::AnyMap{std::forward(properties)...}); } - // Returns a tensor with the following dimensions [num_images_per_prompt, height, width, 3] + /** + * Inpaints an initial image within an area defined by mask and conditioned on prompt + * @param positive_prompt Prompt to generate image(s) from + * @param initial_image RGB/BGR image of [1, height, width, 3] shape used to initialize latent image + * @param mask_image RGB/BGR or GRAY/BINARY image of [1, height, width, 3 or 1] shape used as a mask + * @param properties Image generation parameters specified as properties. Values in 'properties' override default value for generation parameters. + * @returns A tensor which has dimensions [num_images_per_prompt, height, width, 3] + */ ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask_image, const ov::AnyMap& properties = {}); template diff --git a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp index 34b9d6e341..3dc1fc0803 100644 --- a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp @@ -200,7 +200,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { } /** - * Generates image(s) based on prompt and other image generarion parameters + * Generates image(s) based on prompt and other image generation parameters * @param positive_prompt Prompt to generate image(s) from * @param properties Image generation parameters specified as properties. Values in 'properties' override default value for generation parameters. 
* @returns A tensor which has dimensions [num_images_per_prompt, height, width, 3] diff --git a/src/cpp/src/image_generation/image_processor.cpp b/src/cpp/src/image_generation/image_processor.cpp index 8c73ee2da0..1e168da33b 100644 --- a/src/cpp/src/image_generation/image_processor.cpp +++ b/src/cpp/src/image_generation/image_processor.cpp @@ -41,34 +41,30 @@ void IImageProcessor::compile(std::shared_ptr model) { m_request = utils::singleton_core().compile_model(model, m_device).create_infer_request(); } -ImageProcessor::ImageProcessor(const std::string& device, bool do_normalize, bool do_binarize) : +ImageProcessor::ImageProcessor(const std::string& device, bool do_normalize, bool do_binarize, bool gray_scale_source) : IImageProcessor(device) { auto image_processor_model = create_empty_model(); - merge_image_preprocessing(image_processor_model, do_normalize, do_binarize); + merge_image_preprocessing(image_processor_model, do_normalize, do_binarize, gray_scale_source); compile(image_processor_model); } -void ImageProcessor::merge_image_preprocessing(std::shared_ptr model, bool do_normalize, bool do_binarize) { +void ImageProcessor::merge_image_preprocessing(std::shared_ptr model, bool do_normalize, bool do_binarize, bool gray_scale_source) { OPENVINO_ASSERT(do_normalize ^ do_binarize, "Both binarize and normalize are not supported"); // https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L90-L110 ov::preprocess::PrePostProcessor ppp(model); + ov::preprocess::ColorFormat source_color_format = gray_scale_source ? ov::preprocess::ColorFormat::GRAY : ov::preprocess::ColorFormat::RGB; + ppp.input().tensor() .set_layout("NHWC") .set_element_type(ov::element::u8) - .set_color_format(ov::preprocess::ColorFormat::BGR); + .set_color_format(source_color_format); ppp.input().model() .set_layout("NCHW"); if (do_normalize) { - ppp.input().tensor().set_layout("NHWC"); - ppp.input().model().set_layout("NCHW"); - - ppp.input().tensor() - .set_element_type(ov::element::u8); - ppp.input().preprocess() .convert_layout() .convert_element_type(ov::element::f32) diff --git a/src/cpp/src/image_generation/image_processor.hpp b/src/cpp/src/image_generation/image_processor.hpp index d0ef7532aa..8c62742006 100644 --- a/src/cpp/src/image_generation/image_processor.hpp +++ b/src/cpp/src/image_generation/image_processor.hpp @@ -28,9 +28,9 @@ class IImageProcessor { class ImageProcessor : public IImageProcessor { public: - explicit ImageProcessor(const std::string& device, bool do_normalize = true, bool do_binarize = false); + explicit ImageProcessor(const std::string& device, bool do_normalize = true, bool do_binarize = false, bool gray_scale_source = false); - static void merge_image_preprocessing(std::shared_ptr model, bool do_normalize = true, bool do_binarize = false); + static void merge_image_preprocessing(std::shared_ptr model, bool do_normalize = true, bool do_binarize = false, bool gray_scale_source = false); }; class ImageResizer { diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp index 7549b67919..3801c855fd 100644 --- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp @@ -33,14 +33,15 @@ class StableDiffusionPipeline : public DiffusionPipeline { const std::string device = "CPU"; if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == 
PipelineType::INPAINTING) { - const bool do_normalize = true, do_binarize = false; - m_image_processor = std::make_shared(device, do_normalize, do_binarize); + const bool do_normalize = true, do_binarize = false, gray_scale_source = false; + m_image_processor = std::make_shared(device, do_normalize, do_binarize, gray_scale_source); m_image_resizer = std::make_shared(device, ov::element::u8, "NHWC", ov::op::v11::Interpolate::InterpolateMode::BICUBIC_PILLOW); } if (m_pipeline_type == PipelineType::INPAINTING) { - const bool do_normalize = false, do_binarize = true; - m_mask_processor = std::make_shared(device, do_normalize, do_binarize); + bool do_normalize = false, do_binarize = true; + m_mask_processor_rgb = std::make_shared(device, do_normalize, do_binarize, false); + m_mask_processor_gray = std::make_shared(device, do_normalize, do_binarize, true); m_mask_resizer = std::make_shared(device, ov::element::f32, "NCHW", ov::op::v11::Interpolate::InterpolateMode::NEAREST); } } @@ -267,7 +268,8 @@ class StableDiffusionPipeline : public DiffusionPipeline { ov::Shape target_shape = processed_image.get_shape(); ov::Tensor mask_condition = m_image_resizer->execute(mask_image, target_shape[2], target_shape[3]); - mask_condition = m_mask_processor->execute(mask_condition); + std::shared_ptr mask_processor = mask_condition.get_shape()[3] == 1 ? m_mask_processor_gray : m_mask_processor_rgb; + mask_condition = mask_processor->execute(mask_condition); // resize mask to shape of latent space ov::Tensor mask = m_mask_resizer->execute(mask_condition, target_shape[2] / vae_scale_factor, target_shape[3] / vae_scale_factor); @@ -501,7 +503,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { std::shared_ptr m_clip_text_encoder = nullptr; std::shared_ptr m_unet = nullptr; std::shared_ptr m_vae = nullptr; - std::shared_ptr m_image_processor = nullptr, m_mask_processor = nullptr; + std::shared_ptr m_image_processor = nullptr, m_mask_processor_rgb = nullptr, m_mask_processor_gray = nullptr; std::shared_ptr m_image_resizer = nullptr, m_mask_resizer = nullptr; }; From b31b6a152c3771bb92427b85cd85cc5ebd514f36 Mon Sep 17 00:00:00 2001 From: guozhong wang Date: Wed, 18 Dec 2024 01:08:54 +0800 Subject: [PATCH 4/6] Enable print properties of compiled model in genai API (#1289) When setting the environment variable OPENVINO_LOG_LEVEL > ov::log::Level::WARNING, the properties of the compiled model can be printed in genai API. 
When the device is CPU, the properties of the compiled model are as follows: Model: Stateful LLM model NETWORK_NAME: Model0 OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 NUM_STREAMS: 1 INFERENCE_NUM_THREADS: 48 PERF_COUNT: NO INFERENCE_PRECISION_HINT: bf16 PERFORMANCE_HINT: LATENCY EXECUTION_MODE_HINT: PERFORMANCE PERFORMANCE_HINT_NUM_REQUESTS: 0 ENABLE_CPU_PINNING: YES SCHEDULING_CORE_TYPE: ANY_CORE MODEL_DISTRIBUTION_POLICY: ENABLE_HYPER_THREADING: NO EXECUTION_DEVICES: CPU CPU_DENORMALS_OPTIMIZATION: NO LOG_LEVEL: LOG_NONE CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1 DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 KV_CACHE_PRECISION: f16 AFFINITY: CORE EXECUTION_DEVICES: CPU: Intel(R) Xeon(R) Platinum 8468 [stable_diffusion_compiled_model_log.txt](https://github.com/user-attachments/files/18120641/stable_diffusion_compiled_model_log.txt) --------- Co-authored-by: Ilya Lavrenov --- .github/workflows/llm_bench-python.yml | 1 + src/README.md | 4 ++ src/cpp/src/continuous_batching_impl.cpp | 4 +- .../models/autoencoder_kl.cpp | 2 + .../models/clip_text_model.cpp | 1 + .../clip_text_model_with_projection.cpp | 1 + .../models/flux_transformer_2d_model.cpp | 1 + .../models/sd3_transformer_2d_model.cpp | 1 + .../models/t5_encoder_model.cpp | 1 + .../models/unet_inference_dynamic.hpp | 1 + .../models/unet_inference_static_bs1.hpp | 1 + src/cpp/src/llm_pipeline.cpp | 8 +++- src/cpp/src/llm_pipeline_static.cpp | 13 +++--- src/cpp/src/lora_adapter.cpp | 4 +- src/cpp/src/tokenizer.cpp | 2 + src/cpp/src/utils.cpp | 37 ++++++++++++++++ src/cpp/src/utils.hpp | 2 + .../src/visual_language/embedding_model.cpp | 1 + .../src/visual_language/inputs_embedder.cpp | 7 +-- src/cpp/src/visual_language/pipeline.cpp | 2 +- .../src/visual_language/vision_encoder.cpp | 10 +++-- src/cpp/src/whisper_pipeline.cpp | 21 +++++---- src/cpp/src/whisper_pipeline_static.cpp | 13 ++++-- src/docs/DEBUG_LOG.md | 43 +++++++++++++++++++ 24 files changed, 152 insertions(+), 29 deletions(-) create mode 100644 src/docs/DEBUG_LOG.md diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 3d31649cea..6903882ca0 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -61,6 +61,7 @@ jobs: SRC_DIR: ${{ github.workspace }} LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark + OPENVINO_LOG_LEVEL: 3 steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/src/README.md b/src/README.md index c90bc8f4e4..6466b431d0 100644 --- a/src/README.md +++ b/src/README.md @@ -403,3 +403,7 @@ For information on how OpenVINO™ GenAI works, refer to the [How It Works Secti ## Supported Models For a list of supported models, refer to the [Supported Models Section](./docs/SUPPORTED_MODELS.md). + +## Debug Log + +For using debug log, refer to [DEBUG Log](./doc/DEBUG_LOG.md). 
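The mechanical change repeated across the diffs below is the same everywhere: keep the `ov::CompiledModel` in a named variable instead of chaining straight into `create_infer_request()`, pass it to the new `ov::genai::utils::print_compiled_model_properties()` helper (declared in the internal `src/cpp/src/utils.hpp`), and only then create the infer request. A condensed sketch of that pattern follows; `compile_and_log` is an illustrative wrapper name, not something added by the patch.

```cpp
#include <memory>
#include <string>

#include "openvino/openvino.hpp"
#include "utils.hpp"  // internal openvino.genai header declaring print_compiled_model_properties()

// Condensed form of the change applied to each pipeline below. The properties
// are printed only when OPENVINO_LOG_LEVEL is set above ov::log::Level::WARNING.
static ov::InferRequest compile_and_log(ov::Core& core,
                                        const std::shared_ptr<ov::Model>& model,
                                        const std::string& device,
                                        const ov::AnyMap& properties,
                                        const char* model_title) {
    ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
    ov::genai::utils::print_compiled_model_properties(compiled_model, model_title);
    return compiled_model.create_infer_request();
}
```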
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 1e42f5b2d9..bf0c979d39 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -46,7 +46,9 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( const ov::AnyMap& properties, const DeviceConfig& device_config, ov::Core& core) { - ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), properties).create_infer_request(); + auto compiled_model = core.compile_model(model, device_config.get_device(), properties); + ov::genai::utils::print_compiled_model_properties(compiled_model, "LLM with Paged Attention"); + ov::InferRequest infer_request = compiled_model.create_infer_request(); // setup KV caches m_cache_manager = std::make_shared(device_config, core); diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp index e0d6a44189..d3dd7324ee 100644 --- a/src/cpp/src/image_generation/models/autoencoder_kl.cpp +++ b/src/cpp/src/image_generation/models/autoencoder_kl.cpp @@ -212,12 +212,14 @@ AutoencoderKL& AutoencoderKL::compile(const std::string& device, const ov::AnyMa if (m_encoder_model) { ov::CompiledModel encoder_compiled_model = core.compile_model(m_encoder_model, device, properties); + ov::genai::utils::print_compiled_model_properties(encoder_compiled_model, "Auto encoder KL encoder model"); m_encoder_request = encoder_compiled_model.create_infer_request(); // release the original model m_encoder_model.reset(); } ov::CompiledModel decoder_compiled_model = core.compile_model(m_decoder_model, device, properties); + ov::genai::utils::print_compiled_model_properties(decoder_compiled_model, "Auto encoder KL decoder model"); m_decoder_request = decoder_compiled_model.create_infer_request(); // release the original model m_decoder_model.reset(); diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp index d2dab30bcf..efbc840d4f 100644 --- a/src/cpp/src/image_generation/models/clip_text_model.cpp +++ b/src/cpp/src/image_generation/models/clip_text_model.cpp @@ -97,6 +97,7 @@ CLIPTextModel& CLIPTextModel::compile(const std::string& device, const ov::AnyMa } else { compiled_model = core.compile_model(m_model, device, properties); } + ov::genai::utils::print_compiled_model_properties(compiled_model, "Clip Text model"); m_request = compiled_model.create_infer_request(); // release the original model m_model.reset(); diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp index 13c7f5a442..982800a701 100644 --- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp +++ b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp @@ -88,6 +88,7 @@ CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::str } else { compiled_model = core.compile_model(m_model, device, properties); } + ov::genai::utils::print_compiled_model_properties(compiled_model, "Clip Text with projection model"); m_request = compiled_model.create_infer_request(); // release the original model m_model.reset(); diff --git a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp index 6b28b116b0..b09f099655 100644 --- 
a/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp +++ b/src/cpp/src/image_generation/models/flux_transformer_2d_model.cpp @@ -108,6 +108,7 @@ FluxTransformer2DModel& FluxTransformer2DModel::reshape(int batch_size, FluxTransformer2DModel& FluxTransformer2DModel::compile(const std::string& device, const ov::AnyMap& properties) { OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model"); ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, properties); + ov::genai::utils::print_compiled_model_properties(compiled_model, "Flux Transformer 2D model"); m_request = compiled_model.create_infer_request(); // release the original model m_model.reset(); diff --git a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp index 70dddb0476..33771f2316 100644 --- a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp +++ b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp @@ -105,6 +105,7 @@ SD3Transformer2DModel& SD3Transformer2DModel::reshape(int batch_size, SD3Transformer2DModel& SD3Transformer2DModel::compile(const std::string& device, const ov::AnyMap& properties) { OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model"); ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, properties); + ov::genai::utils::print_compiled_model_properties(compiled_model, "SD3 Transformer 2D model"); m_request = compiled_model.create_infer_request(); // release the original model m_model.reset(); diff --git a/src/cpp/src/image_generation/models/t5_encoder_model.cpp b/src/cpp/src/image_generation/models/t5_encoder_model.cpp index 8c6df34667..21df456d46 100644 --- a/src/cpp/src/image_generation/models/t5_encoder_model.cpp +++ b/src/cpp/src/image_generation/models/t5_encoder_model.cpp @@ -63,6 +63,7 @@ T5EncoderModel& T5EncoderModel::compile(const std::string& device, const ov::Any ov::Core core = utils::singleton_core(); ov::CompiledModel compiled_model; compiled_model = core.compile_model(m_model, device, properties); + ov::genai::utils::print_compiled_model_properties(compiled_model, "T5 encoder model"); m_request = compiled_model.create_infer_request(); // release the original model m_model.reset(); diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp index c8658a1c1a..6dc285f76d 100644 --- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp +++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp @@ -20,6 +20,7 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel:: ov::Core core = utils::singleton_core(); ov::CompiledModel compiled_model = core.compile_model(model, device, properties); + ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition dynamic model"); m_request = compiled_model.create_infer_request(); } diff --git a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp index fcde31e9ee..7aa6f6301c 100644 --- a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp +++ b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp @@ -40,6 +40,7 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel ov::Core core = utils::singleton_core(); 
ov::CompiledModel compiled_model = core.compile_model(model, device, properties); + ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition batch-1 model"); for (int i = 0; i < m_native_batch_size; i++) { diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index f663b27dd9..6d9aae30fa 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -77,6 +77,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { const ov::genai::GenerationConfig& generation_config ) : LLMPipelineImplBase(tokenizer, generation_config) { ov::Core core; + ov::CompiledModel compiled_model; auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config); utils::slice_matmul_statefull_model(model); m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model); @@ -84,10 +85,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model."); m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable - m_model_runner = core.compile_model(model, device, *filtered_plugin_config).create_infer_request(); + compiled_model = core.compile_model(model, device, *filtered_plugin_config); + m_model_runner = compiled_model.create_infer_request(); } else { - m_model_runner = core.compile_model(model, device, plugin_config).create_infer_request(); + compiled_model = core.compile_model(model, device, plugin_config); + m_model_runner = compiled_model.create_infer_request(); } + ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model"); // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index cb83209b4b..090aed9650 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -777,12 +777,15 @@ void StaticLLMPipeline::setupAndCompileModels( set_npuw_cache_dir(prefill_config); set_npuw_cache_dir(generate_config); - m_kvcache_request = core.compile_model( + auto kv_compiled_model = core.compile_model( kvcache_model, device, generate_config - ).create_infer_request(); - m_prefill_request = core.compile_model( - prefill_model, device, prefill_config - ).create_infer_request(); + ); + ov::genai::utils::print_compiled_model_properties(kv_compiled_model, "Static LLM kv compiled model"); + m_kvcache_request = kv_compiled_model.create_infer_request(); + + auto prefill_compiled_model = core.compile_model(prefill_model, device, prefill_config); + m_prefill_request = prefill_compiled_model.create_infer_request(); + ov::genai::utils::print_compiled_model_properties(prefill_compiled_model, "Static LLM prefill compiled model"); } void StaticLLMPipeline::setupAndImportModels( diff --git a/src/cpp/src/lora_adapter.cpp b/src/cpp/src/lora_adapter.cpp index 5e8839513e..fd446ef708 100644 --- a/src/cpp/src/lora_adapter.cpp +++ b/src/cpp/src/lora_adapter.cpp @@ -637,7 +637,9 @@ class InferRequestSignatureCache { ov::Core core = ov::genai::utils::singleton_core(); auto model = std::make_shared(request_results, request_parameters); - rwb.request = core.compile_model(model, device).create_infer_request(); + auto compiled_model = core.compile_model(model, device); + 
ov::genai::utils::print_compiled_model_properties(compiled_model, "Infer Request Signature Cache"); + rwb.request = compiled_model.create_infer_request(); requests.emplace(signature, rwb); } diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index cff25f07f8..642236d32a 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -203,6 +203,7 @@ class Tokenizer::TokenizerImpl { manager.register_pass(); manager.run_passes(ov_tokenizer); m_tokenizer = core.compile_model(ov_tokenizer, device, properties); + ov::genai::utils::print_compiled_model_properties(m_tokenizer, "OV Tokenizer"); m_ireq_queue_tokenizer = std::make_unique>( m_tokenizer.get_property(ov::optimal_number_of_infer_requests), @@ -216,6 +217,7 @@ class Tokenizer::TokenizerImpl { manager_detok.register_pass(); manager_detok.run_passes(ov_detokenizer); m_detokenizer = core.compile_model(ov_detokenizer, device, properties); + ov::genai::utils::print_compiled_model_properties(m_detokenizer, "OV Detokenizer"); m_ireq_queue_detokenizer = std::make_unique>( m_detokenizer.get_property(ov::optimal_number_of_infer_requests), diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 3690920295..9fa14b7f9f 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -381,6 +381,43 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t se } } +void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const char* model_title) { + // Specify the name of the environment variable + const char* env_var_name = "OPENVINO_LOG_LEVEL"; + const char* env_var_value = std::getenv(env_var_name); + + // Check if the environment variable was found + if (env_var_value != nullptr && atoi(env_var_value) > static_cast(ov::log::Level::WARNING)) { + // output of the actual settings that the device selected + auto supported_properties = compiled_Model.get_property(ov::supported_properties); + std::cout << "Model: " << model_title << std::endl; + for (const auto& cfg : supported_properties) { + if (cfg == ov::supported_properties) + continue; + auto prop = compiled_Model.get_property(cfg); + if (cfg == ov::device::properties) { + auto devices_properties = prop.as(); + for (auto& item : devices_properties) { + std::cout << " " << item.first << ": " << std::endl; + for (auto& item2 : item.second.as()) { + std::cout << " " << item2.first << ": " << item2.second.as() << std::endl; + } + } + } else { + std::cout << " " << cfg << ": " << prop.as() << std::endl; + } + } + + ov::Core core; + std::vector exeTargets; + exeTargets = compiled_Model.get_property(ov::execution_devices); + std::cout << "EXECUTION_DEVICES:" << std::endl; + for (const auto& device : exeTargets) { + std::cout << " " << device << ": " << core.get_property(device, ov::device::full_name) << std::endl; + } + } +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 57728cd0dc..5342ac427c 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -104,6 +104,8 @@ size_t get_seq_len_axis(std::shared_ptr model); void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional adapter_controller); +void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const char* model_title); + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/visual_language/embedding_model.cpp b/src/cpp/src/visual_language/embedding_model.cpp index 88ddfc39cd..307bdcebac 100644 --- 
a/src/cpp/src/visual_language/embedding_model.cpp +++ b/src/cpp/src/visual_language/embedding_model.cpp @@ -26,6 +26,7 @@ EmbeddingsModel::EmbeddingsModel(const std::filesystem::path& model_dir, merge_postprocess(m_model, scale_emb); ov::CompiledModel compiled_model = core.compile_model(m_model, device, properties); + ov::genai::utils::print_compiled_model_properties(compiled_model, "text embeddings model"); m_request = compiled_model.create_infer_request(); } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index dfdb1521ef..cf77dfce3c 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -259,9 +259,10 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { const std::string& device, const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, model_dir, device, device_config) { - m_resampler = utils::singleton_core().compile_model( - model_dir / "openvino_resampler_model.xml", device, device_config - ).create_infer_request(); + auto compiled_model = + utils::singleton_core().compile_model(model_dir / "openvino_resampler_model.xml", device, device_config); + ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM resampler model"); + m_resampler = compiled_model.create_infer_request(); m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); } diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index b8e89a8e04..1ce0cbf210 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -92,7 +92,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { auto compiled_language_model = utils::singleton_core().compile_model( models_dir / "openvino_language_model.xml", device, properties ); - + ov::genai::utils::print_compiled_model_properties(compiled_language_model, "VLM language model"); auto language_model = compiled_language_model.get_runtime_model(); m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(language_model); diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 0842524820..9f8f9b0498 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -648,10 +648,12 @@ ov::Tensor get_pixel_values_internvl(const ov::Tensor& image, const ProcessorCon VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config) : model_type(model_type) { - m_vision_encoder = utils::singleton_core().compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); - m_processor_config = utils::from_config_json_if_exists( - model_dir, "preprocessor_config.json" - ); + auto compiled_model = utils::singleton_core().compile_model(model_dir / "openvino_vision_embeddings_model.xml", + device, + device_config); + ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM vision embeddings model"); + m_vision_encoder = compiled_model.create_infer_request(); + m_processor_config = utils::from_config_json_if_exists(model_dir, "preprocessor_config.json"); } VisionEncoder::VisionEncoder( diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index 5c31d85fec..d472a20238 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ 
b/src/cpp/src/whisper_pipeline.cpp @@ -56,15 +56,18 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi auto [core_properties, compile_properties] = ov::genai::utils::split_core_compile_config(properties); core.set_property(core_properties); - m_models.encoder = - core.compile_model((models_path / "openvino_encoder_model.xml").string(), device, compile_properties) - .create_infer_request(); - m_models.decoder = - core.compile_model((models_path / "openvino_decoder_model.xml").string(), device, compile_properties) - .create_infer_request(); - m_models.decoder_with_past = - core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, compile_properties) - .create_infer_request(); + ov::CompiledModel compiled_model; + compiled_model = + core.compile_model((models_path / "openvino_encoder_model.xml").string(), device, compile_properties); + ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper encoder model"); + m_models.encoder = compiled_model.create_infer_request(); + compiled_model = + core.compile_model((models_path / "openvino_decoder_model.xml").string(), device, compile_properties); + ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); + m_models.decoder = compiled_model.create_infer_request(); + compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, compile_properties); + m_models.decoder_with_past = compiled_model.create_infer_request(); + ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) { diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp index 9937082a81..136819fa01 100644 --- a/src/cpp/src/whisper_pipeline_static.cpp +++ b/src/cpp/src/whisper_pipeline_static.cpp @@ -555,9 +555,16 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys preprocess_decoder(decoder_model); preprocess_decoder(decoder_with_past_model); - m_models.encoder = core.compile_model(encoder_model, "NPU").create_infer_request(); - m_models.decoder = core.compile_model(decoder_model, "NPU").create_infer_request(); - m_models.decoder_with_past = core.compile_model(decoder_with_past_model, "NPU").create_infer_request(); + ov::CompiledModel compiled_model; + compiled_model = core.compile_model(encoder_model, "NPU"); + ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper encoder model"); + m_models.encoder = compiled_model.create_infer_request(); + compiled_model = core.compile_model(decoder_model, "NPU"); + ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder model"); + m_models.decoder = compiled_model.create_infer_request(); + compiled_model = core.compile_model(decoder_with_past_model, "NPU"); + ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper decoder with past model"); + m_models.decoder_with_past = compiled_model.create_infer_request(); // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) { diff --git a/src/docs/DEBUG_LOG.md b/src/docs/DEBUG_LOG.md new file mode 100644 index 0000000000..5ed3f35d17 --- /dev/null +++ b/src/docs/DEBUG_LOG.md @@ -0,0 +1,43 @@ +## 1. 
Using Debug Log + +There are six levels of logs, which can be called explicitly or set via the ``OPENVINO_LOG_LEVEL`` environment variable: + +0 - ``ov::log::Level::NO`` +1 - ``ov::log::Level::ERR`` +2 - ``ov::log::Level::WARNING`` +3 - ``ov::log::Level::INFO`` +4 - ``ov::log::Level::DEBUG`` +5 - ``ov::log::Level::TRACE`` + +When setting the environment variable OPENVINO_LOG_LEVEL > ov::log::Level::WARNING, the properties of the compiled model can be printed. + +For example: + +Linux - export OPENVINO_LOG_LEVEL=3 +Windows - set OPENVINO_LOG_LEVEL=3 + +the properties of the compiled model are printed as follows: +```sh + NETWORK_NAME: Model0 + OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 + NUM_STREAMS: 1 + INFERENCE_NUM_THREADS: 48 + PERF_COUNT: NO + INFERENCE_PRECISION_HINT: bf16 + PERFORMANCE_HINT: LATENCY + EXECUTION_MODE_HINT: PERFORMANCE + PERFORMANCE_HINT_NUM_REQUESTS: 0 + ENABLE_CPU_PINNING: YES + SCHEDULING_CORE_TYPE: ANY_CORE + MODEL_DISTRIBUTION_POLICY: + ENABLE_HYPER_THREADING: NO + EXECUTION_DEVICES: CPU + CPU_DENORMALS_OPTIMIZATION: NO + LOG_LEVEL: LOG_NONE + CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1 + DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 + KV_CACHE_PRECISION: f16 + AFFINITY: CORE + EXECUTION_DEVICES: + CPU: Intel(R) Xeon(R) Platinum 8468 +``` \ No newline at end of file From 7d2a303270ac2c6f34754edff5611a6e8c23c854 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 18 Dec 2024 07:13:06 +0400 Subject: [PATCH 5/6] remove test models and fix order of checks (#1401) * added removing test models after llm bench tests passing to reduce disk space * fixed order of checks in wwb tests (csv file is result of successful execution of cli command, test trying to open file and only after that check cli command return code) * reduces inference counts in llm bench tests (reduces execution time in 2 times, from 72 min to 36 min) --- .github/workflows/llm_bench-python.yml | 22 ++++++++++++------- tools/llm_bench/task/image_generation.py | 11 ++++++---- .../who_what_benchmark/tests/test_cli_text.py | 9 +++----- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 6903882ca0..1999bafcfe 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -61,7 +61,6 @@ jobs: SRC_DIR: ${{ github.workspace }} LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark - OPENVINO_LOG_LEVEL: 3 steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -103,30 +102,34 @@ jobs: - name: Test native pytorch model on Linux run: | git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen - python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt + python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt -ic 20 + rm -rf tiny-random-qwen env: GIT_LFS_SKIP_SMUDGE: 0 - name: Test tiny-random-baichuan2 on Linux Optimum Intel run: | optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16 - python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum + python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum -ic 10 + rm -rf ./ov_models/tiny-random-baichuan2 - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel run: | 
huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7 - python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum + python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum -ic 4 - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI run: | - python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 + python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 -ic 4 - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA run: | wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591 - python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 + python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 -ic 4 + rm -rf ./ov_models/lcm_dreamshaper_v7/ - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux run: | optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16 optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8 - python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4 - python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5 + python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4 -ic 20 + python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" 
-d cpu --draft_device cpu -n 1 --num_assistant_tokens 5 -ic 20 + rm -rf ov_models/TinyLlama-1.1B-Chat-v1.0 - name: Test whisper-tiny on Linux run: | GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech @@ -138,11 +141,14 @@ jobs: optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny ./ov_models/whisper-tiny python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --optimum python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 + rm -rf ./ov_models/whisper-tiny + rm -rf multilingual_librispeech - name: Text InternVL2-1B on Linux run: | optimum-cli export openvino --model OpenGVLab/InternVL2-1B ./ov_models/internvl2-1B --task image-text-to-text --trust-remote-code python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20 python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20 --optimum + rm -rf ./ov_models/internvl2-1B - name: WWB Tests run: | pip install git+https://github.com/huggingface/optimum-intel.git diff --git a/tools/llm_bench/task/image_generation.py b/tools/llm_bench/task/image_generation.py index b870c7ec98..7f43afe6e2 100644 --- a/tools/llm_bench/task/image_generation.py +++ b/tools/llm_bench/task/image_generation.py @@ -25,11 +25,14 @@ stable_diffusion_hook = StableDiffusionHook() -def collects_input_args(image_param, model_type, model_name, callback=None): +def collects_input_args(image_param, model_type, model_name, infer_count=None, callback=None): input_args = {} input_args["width"] = image_param.get('width', DEFAULT_IMAGE_WIDTH) input_args["height"] = image_param.get('height', DEFAULT_IMAGE_HEIGHT) - input_args["num_inference_steps"] = image_param.get('steps', DEFAULT_INFERENCE_STEPS if 'lcm' not in model_name else LCM_DEFAULT_INFERENCE_STEPS) + if infer_count is None: + input_args["num_inference_steps"] = image_param.get('steps', DEFAULT_INFERENCE_STEPS if 'lcm' not in model_name else LCM_DEFAULT_INFERENCE_STEPS) + else: + input_args["num_inference_steps"] = infer_count guidance_scale = image_param.get('guidance_scale', None) if guidance_scale is not None: @@ -57,7 +60,7 @@ def collects_input_args(image_param, model_type, model_name, callback=None): def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None): set_seed(args['seed']) input_text = image_param['prompt'] - input_args = collects_input_args(image_param, args['model_type'], args['model_name']) + input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["infer_count"]) out_str = f"Input params: Batch_size={args['batch_size']}, " \ f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}" if 'guidance_scale' in input_args: @@ -120,7 +123,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, def run_image_generation_genai(image_param, num, image_id, pipe, args, 
iter_data_list, proc_id, mem_consumption, callback=None): set_seed(args['seed']) input_text = image_param['prompt'] - input_args = collects_input_args(image_param, args['model_type'], args['model_name'], callback) + input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["infer_count"], callback) out_str = f"Input params: Batch_size={args['batch_size']}, " \ f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}" if 'guidance_scale' in input_args: diff --git a/tools/who_what_benchmark/tests/test_cli_text.py b/tools/who_what_benchmark/tests/test_cli_text.py index 79335d46eb..9973cd357f 100644 --- a/tools/who_what_benchmark/tests/test_cli_text.py +++ b/tools/who_what_benchmark/tests/test_cli_text.py @@ -94,9 +94,8 @@ def test_text_gt_data(): "CPU", ] ) + assert result.returncode == 0 data = pd.read_csv(temp_file_name) - - assert result.returncode == 0 assert len(data["questions"].values) == 2 @@ -174,9 +173,8 @@ def test_text_language_autodetect(): "CPU", ] ) + assert result.returncode == 0 data = pd.read_csv(temp_file_name) - - assert result.returncode == 0 assert "马克" in data["prompts"].values[0] @@ -196,9 +194,8 @@ def test_text_hf_model(): "--hf", ] ) + assert result.returncode == 0 data = pd.read_csv(temp_file_name) - - assert result.returncode == 0 assert len(data["prompts"].values) == 2 From 9bcadf7ffdcfe5b133605847d964759593949fac Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 18 Dec 2024 08:33:47 +0400 Subject: [PATCH 6/6] [Prompt lookup] (#1245) *Description:* * Implementation of Prompt lookup decoding based on continuous batching pipeline (cb_promp_lookup_impl + prompt_lookup_impl) * Update `prompt_lookup_sample` to use new API * Update statistic to make of printing more usable *Ticket:* * https://jira.devtools.intel.com/browse/CVS-137987 *Example of usage:* * **Input:** `return 0;` * **Result Prompt lookup:** ``` =============================== Total duration, ms: 3.02267 Draft model duration, ms: 0.000724718 Main model duration, ms: 3.02195 Draft model duration, %: 0.0239761 Main model duration, %: 99.976 AVG acceptance rate, %: 10.8333 =============================== Request_id: 0 ||| 0 0 0 0 0 0 0 0 20 20 0 0 0 0 20 100 80 0 0 0 0 0 0 60 0 0 20 0 0 0 0 0 20 0 0 50 ``` * **Result Greedy:** ``` =============================== Total duration, ms: 3.18111 Draft model duration, ms: 1.538e-06 Main model duration, ms: 3.18111 Draft model duration, %: 4.83479e-05 Main model duration, %: 100 AVG acceptance rate, %: -nan =============================== ``` * **Speedup**: 100 Generated tokens: 5.24% && 300 Generated tokens: 81% (9.42 vs 5.19) --------- Co-authored-by: Ilya Lavrenov --- .github/workflows/causal_lm_cpp.yml | 31 +- .../prompt_lookup_decoding_lm/CMakeLists.txt | 21 +- .../prompt_lookup_decoding_lm.cpp | 357 ++---------------- .../speculative_decoding_lm.cpp | 1 - .../prompt_lookup_decoding_lm/README.md | 41 ++ .../prompt_lookup_decoding_lm.py | 39 ++ .../genai/continuous_batching_pipeline.hpp | 4 + .../openvino/genai/generation_config.hpp | 13 +- .../include/openvino/genai/llm_pipeline.hpp | 7 + src/cpp/src/continuous_batching_impl.cpp | 4 +- src/cpp/src/continuous_batching_impl.hpp | 3 +- src/cpp/src/continuous_batching_pipeline.cpp | 28 +- src/cpp/src/generation_config.cpp | 11 +- .../continuous_batching_for_prompt_lookup.cpp | 85 +++++ .../continuous_batching_for_prompt_lookup.hpp | 40 ++ .../src/prompt_lookup/prompt_lookup_impl.cpp | 159 ++++++++ 
.../src/prompt_lookup/prompt_lookup_impl.hpp | 49 +++ ...batching_for_speculative_decoding_impl.cpp | 4 +- .../speculative_decoding_impl.cpp | 23 +- .../speculative_decoding_metrics.cpp | 59 +++ .../speculative_decoding_metrics.hpp | 7 + src/python/openvino_genai/__init__.py | 2 +- .../openvino_genai/py_openvino_genai.pyi | 13 +- src/python/py_generation_config.cpp | 4 +- src/python/py_llm_pipeline.cpp | 5 +- src/python/py_openvino_genai.cpp | 1 - tests/cpp/CMakeLists.txt | 1 + 27 files changed, 606 insertions(+), 406 deletions(-) create mode 100644 samples/python/prompt_lookup_decoding_lm/README.md create mode 100755 samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py create mode 100644 src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp create mode 100644 src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp create mode 100644 src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp create mode 100644 src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 107777bf74..2e9d72e263 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -491,7 +491,6 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat --task text-generation-with-past - name: run and compare run: | source ./ov/setupvars.sh @@ -505,36 +504,22 @@ jobs: ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt ./build/samples/cpp/text_generation/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt + python ./samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_py.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() + with open('predictions_py.txt', 'r') as f: + predicted_prompt_lookup_py = f.readline() assert predicted_greedy == predicted_prompt_lookup + assert predicted_greedy == predicted_prompt_lookup_py + assert predicted_prompt_lookup == predicted_prompt_lookup_py " echo "Prompt lookup" passed - - name: run and compare (model with seq_length_axis = 1) - run: | - source ./ov/setupvars.sh - - echo 'Code:```python - def add(a, b): - return a + b - ``` - Question: Can you please add 2 and 3 - A:' > ./prompt.txt - - ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./Qwen-7B-Chat/ "$( predictions_prompt_lookup.txt - ./build/samples/cpp/text_generation/greedy_causal_lm ./Qwen-7B-Chat/ "$( predictions_greedy.txt - python -c " - with open('predictions_greedy.txt', 'r') as f: - predicted_greedy = f.readline() - with open('predictions_prompt_lookup.txt', 'r') as f: - predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup - " - echo "Prompt lookup" passed - + env: + PYTHONPATH: "./build/:$PYTHONPATH" + LD_LIBRARY_PATH: "./build/openvino_genai/:$LD_LIBRARY_PATH" cpp-Phi-1_5: runs-on: ubuntu-20.04-16-cores defaults: diff --git 
a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index c899c6e47b..b0ce8b1b60 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -1,8 +1,6 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) - find_package(OpenVINOGenAI REQUIRED PATHS "${CMAKE_BINARY_DIR}" # Reuse the package from the build. @@ -10,21 +8,16 @@ find_package(OpenVINOGenAI REQUIRED NO_CMAKE_FIND_ROOT_PATH ) -add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) -target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime openvino::threading) -set_target_properties(prompt_lookup_decoding_lm PROPERTIES - COMPILE_PDB_NAME prompt_lookup_decoding_lm +set(TARGET_NAME prompt_lookup_decoding_lm) +add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) +target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai) + +set_target_properties(${TARGET_NAME} PROPERTIES + COMPILE_PDB_NAME ${TARGET_NAME} # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17) - -get_target_property(genai_imported openvino::genai IMPORTED_LOCATION) -set(OPENVINO_TOKENIZERS_PATH $,${genai_imported},$>) -set(OPENVINO_TOKENIZERS_FILENAME "${CMAKE_SHARED_LIBRARY_PREFIX}openvino_tokenizers${CMAKE_SHARED_LIBRARY_SUFFIX}") -target_compile_definitions(prompt_lookup_decoding_lm PRIVATE - OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}/${OPENVINO_TOKENIZERS_FILENAME}") -install(TARGETS prompt_lookup_decoding_lm +install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 282220a4b1..e692110027 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -1,338 +1,45 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include #include -#include -namespace { - -// only batch_size = 1 currently supported -constexpr size_t BATCH_SIZE = 1; - -size_t get_seq_len_axis(std::shared_ptr model) { - // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], - // therefore usually seq_length_axis = 2 - size_t seq_length_axis = 2; - - // "ReadValue" node is KV cache representation in stateful model - std::string kv_node_type_name = std::string(ov::op::v6::ReadValue::get_type_info_static().name); - - for (const auto op : model->get_ops()) { - if (op->get_type_name() != kv_node_type_name) { - continue; - } - - // Shape example: [-1,4,0,64] - auto shape = op->get_input_partial_shape(0); - - for (size_t i = 0; i < shape.rank().get_length(); i++) { - // Find axis = 0. This would be sequence length axis. 
- if (shape[i] == 0) { - seq_length_axis = i; - } - } - break; - } - - return seq_length_axis; -} - -std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); - tokenizer.infer(); - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -std::string detokenize(ov::InferRequest& detokenizer, std::vector& tokens) { - detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()}); - detokenizer.infer(); - return detokenizer.get_output_tensor().data()[0]; -} - -// The following reasons require TextStreamer to keep a cache of previous tokens: -// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a", -// but detokenize(tokenize("prefix a")) == "prefix a" -// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" -struct TextStreamer { - ov::InferRequest detokenizer; - std::vector token_cache; - size_t print_len = 0; - - void put(int64_t token) { - token_cache.push_back(token); - std::string text = detokenize(detokenizer, token_cache); - if (!text.empty() && '\n' == text.back() && text.size() > print_len) { - // Flush the cache after the new line symbol - std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; - token_cache.clear(); - print_len = 0; - return; - } - constexpr char replacement[] = "\xef\xbf\xbd"; // MSVC with /utf-8 fails to compile � directly with newline in string literal error. - if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) { - // Don't print incomplete text - return; - } else if (text.size() > print_len) { - // It is possible to have a shorter text after adding new token. - // Print to output only if text length is increaeseds. - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - } - } - - void end() { - std::string text = detokenize(detokenizer, token_cache); - if (text.size() <= print_len) - return; - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; - token_cache.clear(); - print_len = 0; - } -}; - -ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { - // Copy elements from the old to a new tensor and return it. 
- // Trim kv tensor on sequence length axis - // key/values tensor shape example: [BATCH_SIZE, num_kv_heads, seq_len, head_size] - // Sequence length axis position may vary from one model to another - - auto shape = tensor.get_shape(); - - OPENVINO_ASSERT(seq_len_axis < shape.size(), - "Sequence length axis: ", - seq_len_axis, - " should be less than shape size: ", - shape.size()); - - size_t old_seq_len = shape[seq_len_axis]; - - OPENVINO_ASSERT(new_seq_len <= old_seq_len); - - // if new_seq_len equal to old one no need to copy tensor, return as is - if (old_seq_len == new_seq_len) - return tensor; - - shape[seq_len_axis] = new_seq_len; - - if (seq_len_axis == 0) { - tensor.set_shape(shape); - return tensor; - } - - ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{shape}; - - auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); - - return new_tensor; -} - -void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { - // trim kv_cache values up to the new_seq_len - auto states = request.query_state(); - ov::parallel_for(states.size(), [&](size_t i) { - ov::Tensor old_tensor = states.at(i).get_state(); - states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); - }); -} - -class PromptLookupCandidateGenerator { -private: - const size_t max_ngram_size = 3; - size_t num_pred_tokens = 5; - const size_t max_pred_tokens = 20; - -public: - PromptLookupCandidateGenerator(const size_t max_ngram_size, const size_t num_pred_tokens) - : max_ngram_size{max_ngram_size}, - num_pred_tokens{num_pred_tokens} {}; - - std::vector generate_candidates(const std::vector& input_ids) { - const size_t input_length = input_ids.size(); - - for (int32_t ngram_size = max_ngram_size; ngram_size > 0; ngram_size--) { - // extract last ngram_size tokens as search ngram - std::vector ngram = std::vector{input_ids.cend() - ngram_size, input_ids.cend()}; - - // find ngram match in input_ids - size_t ngram_i = 0; - for (size_t input_i = 0; input_i < input_length - ngram_size; input_i++) { - if (ngram[ngram_i] != input_ids[input_i]) { - ngram_i = 0; - continue; - } - - ngram_i++; - - if (ngram_i < ngram_size) { - continue; - } - - // match found with the end at input_i - size_t avaliable_num_pred = std::min(input_length - (input_i + 1), num_pred_tokens); - - // return candidates with length of avaliable_num_pred - return std::vector{input_ids.cbegin() + input_i + 1, - input_ids.cbegin() + input_i + 1 + avaliable_num_pred}; - } - } - - return std::vector{}; - } - - void update_candidate_strategy(const size_t num_matches) { - // dynamically adjust number of generated candidates based on number of matches - // we want to balance the benefits of getting assistant tokens correct with the - // cost of forecasting incorrect assistant tokens. 
- if (num_matches == num_pred_tokens) { - num_pred_tokens = std::min(num_pred_tokens + 2, max_pred_tokens); - } else { - num_pred_tokens = std::max(num_pred_tokens - 1, size_t(1)); - } - } -}; - -int64_t get_eos_token(const std::shared_ptr tokenizer) { - auto rt_info = tokenizer->get_rt_info(); // Get the runtime info for the model - - auto it = rt_info.find("eos_token_id"); - if (it == rt_info.end()) { - throw std::runtime_error("EOS token ID not found in model's runtime information."); - } - return it->second.as(); -} - -} // namespace +#include "openvino/genai/llm_pipeline.hpp" int main(int argc, char* argv[]) try { - if (argc != 3) { + if (3 != argc) { throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); } - // tokenizer model - ov::Core core; - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - - const std::string model_dir = std::string{argv[1]}; - - auto tokenizer_model = core.read_model(model_dir + "/openvino_tokenizer.xml"); - // tokenizer and detokenizer work on CPU only - ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); - auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]); - - std::vector full_input_ids{input_ids.data(), input_ids.data() + input_ids.get_size()}; - - ov::InferRequest detokenizer = - core.compile_model(model_dir + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - TextStreamer text_streamer{std::move(detokenizer)}; - - std::shared_ptr ov_model = core.read_model(model_dir + "/openvino_model.xml"); - - size_t seq_len_axis = get_seq_len_axis(ov_model); - - ov::InferRequest model = core.compile_model(ov_model, "CPU").create_infer_request(); - - model.set_tensor("input_ids", input_ids); - model.set_tensor("attention_mask", attention_mask); - - ov::Tensor position_ids = model.get_tensor("position_ids"); - position_ids.set_shape(input_ids.get_shape()); - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - size_t seq_len = input_ids.get_shape()[1]; - - // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 - model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); - model.get_tensor("beam_idx").data()[0] = 0; - - // To collect kv-cache for the and to get the next token run the very first infer request - model.infer(); - - // logits shape is [BATCH_SIZE, seq_len, vocab_size] - auto logits = model.get_tensor("logits"); - size_t vocab_size = logits.get_shape().back(); - auto data_logits = logits.data() + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits; - - full_input_ids.push_back(out_token); - - auto first_token = out_token; - text_streamer.put(out_token); - - const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); - - // Prompt lookup decoding is a speculative decoding technique where the draft model replaced - // with string matching in the prompt to generate candidate token sequences. 
- int max_sequence_length = 100; - PromptLookupCandidateGenerator candidateGenerator{3, 5}; - - while (out_token != EOS_TOKEN && seq_len < max_sequence_length) { - auto candidates = candidateGenerator.generate_candidates(full_input_ids); - - // cut redundant candidates on last iteration - size_t tokens_to_generate = max_sequence_length - seq_len; - candidates.resize(std::min(candidates.size(), tokens_to_generate - 1)); - size_t candidates_size = candidates.size(); - - // candidates_size + 1 tokens will be fed at once in a single infer request. - input_ids.set_shape({BATCH_SIZE, candidates_size + 1}); - input_ids.data()[0] = first_token; - std::copy_n(candidates.begin(), candidates_size, input_ids.data() + 1); - - attention_mask.set_shape({BATCH_SIZE, seq_len + candidates_size + 1}); - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - - position_ids.set_shape({BATCH_SIZE, candidates_size + 1}); - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), seq_len); - - model.infer(); - - data_logits = logits.data(); // [BATCH_SIZE, 1 + candidates_size, vocab_size] - - // 1. accept current out token (if not eos) - // 2. check if it matches appropriate candidate - // 2.1 if it's match, continue - accept next token - // 2.2 it it's mismatch, stop iteration but still accept current token as it was last token generated by - // model from a valid sequence. - size_t accepted_tokens_number = 0; - for (size_t i = 0; i < candidates_size + 1; i++) { - auto start = data_logits + vocab_size * i; - auto stop = data_logits + vocab_size * (i + 1); - out_token = std::max_element(start, stop) - start; - - if (out_token == EOS_TOKEN) { - break; - } - - text_streamer.put(out_token); - full_input_ids.push_back(out_token); - accepted_tokens_number++; - - if (i == candidates_size || out_token != candidates[i]) { - break; - } - } - - if (accepted_tokens_number > 0) { - candidateGenerator.update_candidate_strategy(accepted_tokens_number - 1); - } - - // After the inference request, key/values have shape [BATCH_SIZE, seq_len + candidates_size, vocab_size]. - // Increment the sequence length by the number of matched tokens, and - // trim the KV cache to match the new sequence length. - seq_len += accepted_tokens_number; - update_kv_cache(model, seq_len_axis, seq_len); - - first_token = out_token; - } - - text_streamer.end(); - // Model is stateful which means that context (kv-cache) which belongs to a particular - // text sequence is accumulated inside the model during the generation loop above. - // This context should be reset before processing the next text sequence. - // While it is not required to reset context in this sample as only one sequence is processed, - // it is called for education purposes: - model.reset_state(); + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + // Define candidates number for candidate generation + config.num_assistant_tokens = 5; + // Define max_ngram_size + config.max_ngram_size = 3; + + std::string model_path = argv[1]; + std::string prompt = argv[2]; + + std::string device = "CPU"; + + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.cache_size = 5; + + ov::genai::LLMPipeline pipe( + model_path, + device, + ov::genai::prompt_lookup(true), + ov::genai::scheduler_config(scheduler_config)); + + auto streamer = [](std::string subword) { + std::cout << subword << std::flush; + return false; + }; + + // Since the streamer is set, the results will + // be printed each time a new token is generated. 
+    pipe.generate(prompt, config, streamer);
+    std::cout << std::endl;
 } catch (const std::exception& error) {
     try {
         std::cerr << error.what() << '\n';
diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
index dc6761879c..487296566b 100644
--- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
+++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
@@ -29,7 +29,6 @@ int main(int argc, char* argv[]) try {
     ov::genai::SchedulerConfig scheduler_config;
     scheduler_config.cache_size = 5;
-    // Different devices require different block sizes, so different scheduler configs need to be set.
     ov::genai::LLMPipeline pipe(
         main_model_path,
         main_device,
diff --git a/samples/python/prompt_lookup_decoding_lm/README.md b/samples/python/prompt_lookup_decoding_lm/README.md
new file mode 100644
index 0000000000..1e5f4003d4
--- /dev/null
+++ b/samples/python/prompt_lookup_decoding_lm/README.md
@@ -0,0 +1,41 @@
+# prompt_lookup_decoding_lm Python sample that supports most popular models like LLaMA 3
+
+[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching in the prompt to generate candidate token sequences. This method is highly effective for input-grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between the LLM input (prompt) and the LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality.
+
+This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of an LLM-powered chatbot in Python.
+
+## Download and convert the model and tokenizers
+
+The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
+
+It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported.
+
+```sh
+source /setupvars.sh
+pip install --upgrade-strategy eager -r ../../requirements.txt
+optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
+```
+
+## Run
+
+Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then run a sample:
+
+`python prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"`
+
+Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU.
+
+See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
+
+### Troubleshooting
+
+#### Unicode characters encoding error on Windows
+
+Example error:
+```
+UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to
+```
+
+If you encounter this error while the sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this:
+1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot.
+2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`.
diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
new file mode 100755
index 0000000000..557897b6b1
--- /dev/null
+++ b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import openvino_genai
+
+def streamer(subword):
+    print(subword, end='', flush=True)
+    # The return flag indicates whether generation should be stopped.
+    # False means continue generation.
+    return False
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('model_dir')
+    parser.add_argument('prompt')
+    args = parser.parse_args()
+
+    device = 'CPU'
+    scheduler_config = openvino_genai.SchedulerConfig()
+    # cache params
+    scheduler_config.cache_size = 2
+
+    pipe = openvino_genai.LLMPipeline(args.model_dir, device, scheduler_config=scheduler_config, prompt_lookup=True)
+
+    config = openvino_genai.GenerationConfig()
+    config.max_new_tokens = 100
+    # enable prompt lookup decoding: generate `num_assistant_tokens` candidates per iteration
+    config.num_assistant_tokens = 5
+    # define the maximum ngram size used to search for matches in the prompt
+    config.max_ngram_size = 3
+
+    # Since the streamer is set, the results will be printed
+    # every time a new token is generated and put into the streamer queue.
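+    # Prompt lookup is active only when both `num_assistant_tokens` and `max_ngram_size` are set
+    # (see GenerationConfig.is_prompt_lookup()); candidates found in the prompt are validated by the main model.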
+ pipe.generate(args.prompt, config, streamer) + +if '__main__' == __name__: + main() diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index 4a0637f2d9..74466ee488 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -55,10 +55,14 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { class ImplInterface; class ContinuousBatchingImpl; class ContinuousBatchingForSpeculativeDecodingImpl; + class ContinuousBatchingForPromptLookupImpl; class SpeculativeDecodingImpl; + class PromptLookupImpl; friend class ContinuousBatchingForSpeculativeDecodingImpl; + friend class ContinuousBatchingForPromptLookupImpl; friend class SpeculativeDecodingImpl; + friend class PromptLookupImpl; std::shared_ptr m_impl; diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 9d79240aa8..b8b222e347 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -71,9 +71,10 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER }; * @param frequency_penalty reduces absolute log prob as many times as the token was generated. * @param rng_seed initializes random generator. * - * Speculative decoding parameters: - * @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of static strategy candidates number update. - * @param num_assistant_tokens the defined candidates number to be generated by draft model in case of dynamic strategy candidates number update. + * Assisting generation parameters: + * @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of dynamic strategy candidates number update. + * @param num_assistant_tokens the defined candidates number to be generated by draft model/prompt lookup in case of static strategy candidates number update. + * @param max_ngram_size is maximum ngram to use when looking for matches in the prompt. */ class OPENVINO_GENAI_EXPORTS GenerationConfig { @@ -114,9 +115,10 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { float frequency_penalty = 0.0f; size_t rng_seed = 0; - // Speculative decoding + // Assisting generation parameters float assistant_confidence_threshold = 0.f; size_t num_assistant_tokens = 0; + size_t max_ngram_size = 0; // EOS special token int64_t eos_token_id = -1; @@ -132,7 +134,10 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { bool is_greedy_decoding() const; bool is_beam_search() const; bool is_multinomial() const; + OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2025.0.0 release") bool is_speculative_decoding() const; + bool is_assisting_generation() const; + bool is_prompt_lookup() const; void update_generation_config(const ov::AnyMap& config_map); template diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 44427d45b1..948baab6f4 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -320,5 +320,12 @@ inline std::pair draft_model( */ static constexpr ov::Property scheduler_config{"scheduler_config"}; +/** +* @brief enable prompt_lookup property serves to activate prompt lookup decoding. 
+* Set `true` to activate this mode. +* And create LLMPipeline instance with this config. +*/ +static constexpr ov::Property prompt_lookup{"prompt_lookup"}; + } // namespace genai } // namespace ov diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index bf0c979d39..6e7e982a4c 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -16,10 +16,12 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& properties, - const ov::genai::GenerationConfig& generation_config + const ov::genai::GenerationConfig& generation_config, + bool is_validation_mode_enabled ) { m_tokenizer = tokenizer; m_generation_config = generation_config; + m_is_validation_mode_enabled = is_validation_mode_enabled; ov::Core core; diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index 780bff6a31..8da05c6dfa 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -58,7 +58,8 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& properties, - const ov::genai::GenerationConfig& generation_config); + const ov::genai::GenerationConfig& generation_config, + bool is_validation_mode_enabled = false); GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 2faad4354e..148eb2fa9f 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -11,6 +11,7 @@ #include "openvino/genai/tokenizer.hpp" #include "continuous_batching_impl.hpp" #include "speculative_decoding/speculative_decoding_impl.hpp" +#include "prompt_lookup/prompt_lookup_impl.hpp" #include "timer.hpp" #include "utils.hpp" #include "debug_utils.hpp" @@ -28,6 +29,15 @@ extract_draft_model_from_config(ov::AnyMap& config) { return draft_model; } +inline bool +extract_prompt_lookup_from_config(ov::AnyMap& config) { + bool res = false; + if (config.find(ov::genai::prompt_lookup.name()) != config.end()) { + res = config.at(ov::genai::prompt_lookup.name()).as(); + config.erase(ov::genai::prompt_lookup.name()); + } + return res; +} ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, @@ -36,12 +46,16 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p const ov::AnyMap& tokenizer_properties) { auto properties_without_draft_model = properties; auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model); + auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model); std::filesystem::path openvino_model_name = "openvino_model.xml"; auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string()); auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties); auto generation_config = utils::from_config_json_if_exists(models_path); - if (draft_model_desr.model == nullptr) { + if (is_prompt_lookup_enabled) { + OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded"); + m_impl = std::make_shared(model, tokenizer, 
scheduler_config, device, properties_without_draft_model, generation_config); + } else if (draft_model_desr.model == nullptr) { m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config); } else { auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); @@ -57,11 +71,15 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const ov::AnyMap& properties) { auto properties_without_draft_model = properties; auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model); + auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model); std::filesystem::path openvino_model_name = "openvino_model.xml"; auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string()); auto generation_config = utils::from_config_json_if_exists(models_path); - if (draft_model_desr.model == nullptr) { + if (is_prompt_lookup_enabled) { + OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded"); + m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config); + } else if (draft_model_desr.model == nullptr) { m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config); } else { auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); @@ -79,9 +97,13 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const ov::genai::GenerationConfig& generation_config) { auto properties_without_draft_model = properties; auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model); + auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model); auto model = utils::singleton_core().read_model(model_str, weights_tensor); - if (draft_model_desr.model == nullptr) { + if (is_prompt_lookup_enabled) { + OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually excluded"); + m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties_without_draft_model, generation_config); + } else if (draft_model_desr.model == nullptr) { m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config); } else { auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 189cfeded7..35ae92d605 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -132,9 +132,17 @@ bool GenerationConfig::is_multinomial() const { } bool GenerationConfig::is_speculative_decoding() const { + return is_assisting_generation(); +} + +bool GenerationConfig::is_assisting_generation() const { return (assistant_confidence_threshold > 0 || num_assistant_tokens > 0); } +bool GenerationConfig::is_prompt_lookup() const { + return (max_ngram_size > 0 && num_assistant_tokens > 0); +} + void GenerationConfig::validate() const { OPENVINO_ASSERT(eos_token_id == -1 || stop_token_ids.find(eos_token_id) != stop_token_ids.end(), "'stop_token_ids' must contain 'eos_token_id'. 
Please, call 'set_eos_token_id' with 'eos_token_id' value"); @@ -181,9 +189,10 @@ void GenerationConfig::validate() const { OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]"); OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]"); } - if (is_speculative_decoding()) { + if (is_assisting_generation()) { if (assistant_confidence_threshold != 0.f) { OPENVINO_ASSERT(num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`"); + OPENVINO_ASSERT(!is_prompt_lookup(), "Parameters `assistant_confidence_threshold` cannot be used while Prompt Lookup decoding"); } else { OPENVINO_ASSERT(num_assistant_tokens > 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`"); }; diff --git a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp new file mode 100644 index 0000000000..8c9e520728 --- /dev/null +++ b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "continuous_batching_for_prompt_lookup.hpp" + +namespace ov::genai { + +std::map +ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::get_generated_request_len() { + std::map result; + for (const auto& request : m_requests) { + const auto request_id = request->get_request_id(); + auto validation_len = request->get_num_tokens_to_validate(); + auto generated_len = request->get_num_processed_tokens() - request->get_prompt_len() + 1; + result.insert({ request_id, { generated_len, validation_len } }); + } + return result; +} + +TokenIds ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::generate_candidates(const TokenIds& input_ids, size_t num_pred_tokens, size_t max_ngram_size) { + if (num_pred_tokens == 0) { + return std::vector{}; + } + + const size_t input_length = input_ids.size(); + + for (int32_t ngram_size = max_ngram_size; ngram_size > 0; ngram_size--) { + // extract last ngram_size tokens as search ngram + std::vector ngram = std::vector{input_ids.cend() - ngram_size, input_ids.cend()}; + + // find ngram match in input_ids + size_t ngram_i = 0; + for (size_t input_i = 0; input_i < input_length - ngram_size; input_i++) { + if (ngram[ngram_i] != input_ids[input_i]) { + ngram_i = 0; + continue; + } + + ngram_i++; + + if (ngram_i < ngram_size) { + continue; + } + + // match found with the end at input_i + size_t avaliable_num_pred = std::min(input_length - (input_i + 1), num_pred_tokens); + + // return candidates with length of avaliable_num_pred + return std::vector{input_ids.cbegin() + input_i + 1, + input_ids.cbegin() + input_i + 1 + avaliable_num_pred}; + } + } + + return std::vector{}; +} + +void ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::generate_candidates() { + for (auto& request : m_requests) { + const auto prompt = request->get_prompt_ids(); + size_t max_validation_len = 0; + for (auto& running_sequence : request->get_running_sequences()) { + const auto generated_tokens = running_sequence->get_generated_ids(); + TokenIds full_input_ids = prompt; + full_input_ids.insert(full_input_ids.end(), generated_tokens.begin(), generated_tokens.end()); + + size_t min_num_assistant_tokens = 0; + const auto 
sampling_params = request->get_sampling_parameters(); + { + const auto generated_len = running_sequence->get_generated_len(); + const auto left_generated_len = std::min(sampling_params.max_new_tokens, sampling_params.max_length) - generated_len - 1; + min_num_assistant_tokens = std::min(sampling_params.num_assistant_tokens, left_generated_len); + } + TokenIds candidates = generate_candidates(full_input_ids, min_num_assistant_tokens, sampling_params.max_ngram_size); + + if (!candidates.empty()) { + for (const auto& candidate : candidates) { + running_sequence->append_token(candidate, 0); + } + max_validation_len = std::max(max_validation_len, candidates.size()); + } + } + request->set_num_validated_tokens(max_validation_len); + } +} +} \ No newline at end of file diff --git a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp new file mode 100644 index 0000000000..8962aba0f2 --- /dev/null +++ b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/continuous_batching_pipeline.hpp" + +#include "continuous_batching_impl.hpp" + +namespace ov::genai { +class ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl : public ContinuousBatchingPipeline::ContinuousBatchingImpl { +public: + ContinuousBatchingForPromptLookupImpl() = default; + + ContinuousBatchingForPromptLookupImpl( + const std::shared_ptr& model, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config, + bool is_validation_mode_enabled = false) : + ContinuousBatchingImpl{ model, + tokenizer, + scheduler_config, + device, + properties, + generation_config, + true } {}; + + void generate_candidates(); + + // { generated_len, validation_len } + using SequenceLen = std::pair; + std::map get_generated_request_len(); + +protected: + TokenIds generate_candidates(const TokenIds& input_ids, size_t num_pred_tokens, size_t max_ngram_size); +}; +} \ No newline at end of file diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp new file mode 100644 index 0000000000..f934a56939 --- /dev/null +++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp @@ -0,0 +1,159 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "prompt_lookup_impl.hpp" +#include "text_callback_streamer.hpp" + +namespace ov::genai { +template struct overloaded : Ts... {using Ts::operator()...;}; +template overloaded(Ts...) 
-> overloaded; + +GenerationHandle +ContinuousBatchingPipeline::PromptLookupImpl::add_request(uint64_t request_id, + const ov::Tensor& input_ids, + ov::genai::GenerationConfig sampling_params) { + OPENVINO_ASSERT(sampling_params.is_prompt_lookup(), "`max_ngram_size` && `num_assistant_tokens` should be specified for `prompt lookup decoding`"); + return m_pipeline->add_request(request_id, input_ids, sampling_params); +}; + +GenerationHandle +ContinuousBatchingPipeline::PromptLookupImpl::add_request(uint64_t request_id, + const std::string& prompt, + ov::genai::GenerationConfig sampling_params) { + OPENVINO_ASSERT(sampling_params.is_prompt_lookup(), "`max_ngram_size` && `num_assistant_tokens` should be specified for `prompt lookup decoding`"); + return m_pipeline->add_request(request_id, prompt, sampling_params); +} + +bool ContinuousBatchingPipeline::PromptLookupImpl::has_non_finished_requests() { + return m_pipeline->has_non_finished_requests(); +} + +void ContinuousBatchingPipeline::PromptLookupImpl::step() { + ManualTimer candidates_timer("prompt_lookup_decoding: generate_candidates()"); + candidates_timer.start(); + m_pipeline->generate_candidates(); + candidates_timer.end(); + m_sd_metrics.draft_duration += candidates_timer.get_duration(); + auto generated_len_before = m_pipeline->get_generated_request_len(); + + ManualTimer main_timer("prompt_lookup_decoding: step()"); + main_timer.start(); + m_pipeline->step(); + main_timer.end(); + m_sd_metrics.main_duration += main_timer.get_duration(); + m_pipeline_metrics = m_pipeline->get_metrics(); + auto generated_len_after = m_pipeline->get_generated_request_len(); + + for (const auto request : generated_len_before) { + auto request_id = request.first; + auto prev_validation_len = request.second.second; + if (prev_validation_len == 0) { + continue; + } + size_t num_matches = prev_validation_len; + float acceptance_rate = 1.f; + if (generated_len_after.count(request.first)) { + auto present_req_len = generated_len_after.at(request.first).first; + auto prev_full_req_len = request.second.first; + + num_matches = (present_req_len - prev_full_req_len - 1); + acceptance_rate = static_cast(num_matches) / static_cast(prev_validation_len); + } + m_sd_metrics.update_acceptance_rate(request_id, acceptance_rate * 100); + m_sd_metrics.update_draft_accepted_tokens(request_id, num_matches); + } + + if (generated_len_after.empty() && 0) { + m_sd_metrics.print(true); + m_sd_metrics.clean_up(); + } +} + +std::vector +ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector& input_ids, + const std::vector& sampling_params, + const StreamerVariant& streamer) { + ManualTimer generate_timer("speculative_decoding: generate()"); + generate_timer.start(); + OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. 
Use ContinuousBatchingPipeline::add_request"); + OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); + const std::shared_ptr& streamer_ptr = std::visit(overloaded{ + [](std::monostate) -> std::shared_ptr { + return nullptr; + }, + [](const std::shared_ptr& streamer) { + return streamer; + }, + [this](const std::function& streamer) -> std::shared_ptr { + return std::make_unique(m_tokenizer, streamer); + } + }, streamer); + + OPENVINO_ASSERT(streamer_ptr == nullptr || input_ids.size() == 1 && (sampling_params[0].is_greedy_decoding() || sampling_params[0].is_multinomial()), + "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding"); + + std::vector main_generations; + for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { + OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); + OPENVINO_ASSERT(sampling_params[request_id].is_prompt_lookup(), "`max_ngram_size` && `num_assistant_tokens` should be specified for `prompt lookup decoding`"); + main_generations.push_back(m_pipeline->add_request(request_id, input_ids[request_id], sampling_params[request_id])); + } + + std::vector results; + results.reserve(input_ids.size()); + + bool continue_generation = true; + while (has_non_finished_requests() && continue_generation) { + step(); + if (streamer_ptr) { + // not generated tokens like several prompt phase + if (!main_generations.at(0).get()->can_read()) { + continue; + } + std::unordered_map token = main_generations.at(0).get()->back(); + OPENVINO_ASSERT(1 <= token.size()); + OPENVINO_ASSERT(1 <= token.begin()->second.generated_ids.size()); + for (const auto& gen_token : token.begin()->second.generated_ids) { + continue_generation = !streamer_ptr->put(gen_token); + if (!continue_generation) { + break; + } + } + } + } + if (streamer_ptr) { + streamer_ptr->end(); + } + + for (size_t generation_idx = 0; generation_idx < main_generations.size(); ++generation_idx) { + const auto& generation = main_generations[generation_idx]; + EncodedGenerationResult result; + result.m_request_id = 1; + std::vector generation_outputs = generation->read_all(); + std::sort(generation_outputs.begin(), generation_outputs.end(), [=] (GenerationOutput& r1, GenerationOutput& r2) { + return r1.score > r2.score; + }); + + auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); + for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { + const auto& generation_output = generation_outputs[generation_output_idx]; + m_sd_metrics.set_generated_len(generation_idx, generation_outputs[generation_output_idx].generated_ids.size()); + result.m_generation_ids.push_back(std::move(generation_output.generated_ids)); + result.m_scores.push_back(generation_output.score); + } + result.m_status = generation->get_status(); + results.push_back(std::move(result)); + } + + OPENVINO_ASSERT(results.size() == input_ids.size()); + generate_timer.end(); + m_sd_metrics.total_duration = generate_timer.get_duration(); + + return results; +} + +SpeculativeDecodingMetrics +ContinuousBatchingPipeline::PromptLookupImpl::get_metrics() { + return m_sd_metrics; +}; +} diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp new file mode 100644 index 0000000000..dae721741b --- /dev/null +++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2023-2024 Intel 
Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "continuous_batching_impl.hpp" +#include "continuous_batching_for_prompt_lookup.hpp" +#include "speculative_decoding/speculative_decoding_metrics.hpp" +#include "utils.hpp" + +namespace ov::genai { + +class ContinuousBatchingPipeline::PromptLookupImpl : public ContinuousBatchingPipeline::ImplInterface { +protected: + std::shared_ptr m_pipeline; + SpeculativeDecodingMetrics m_sd_metrics; + +public: + PromptLookupImpl(const std::shared_ptr& model, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config) { + m_tokenizer = tokenizer; + m_pipeline = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config); + }; + + GenerationHandle add_request(uint64_t request_id, + const ov::Tensor& input_ids, + ov::genai::GenerationConfig sampling_params) override; + GenerationHandle add_request(uint64_t request_id, + const std::string& prompt, + ov::genai::GenerationConfig sampling_params) override; + + bool has_non_finished_requests() override; + + void step() override; + + std::vector + generate(const std::vector& input_ids, + const std::vector& sampling_params, + const StreamerVariant& streamer) override; + + SpeculativeDecodingMetrics get_metrics(); +}; + +} \ No newline at end of file diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp index 06a51b38be..36f274f30f 100644 --- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp @@ -141,7 +141,7 @@ init_request( LogitProcessor& logit_processor, bool is_update_logit_processor, bool is_init_all_sequences_in_request = false) { - OPENVINO_ASSERT(request->get_sampling_parameters().is_speculative_decoding(), + OPENVINO_ASSERT(request->get_sampling_parameters().is_assisting_generation(), "Speculative decoding should have initialized options `assistant_confidence_threshold` xor `num_assistant_tokens` in `GenerationConfig`."); if (candidates.begin()->second.token_ids.empty() && !is_init_all_sequences_in_request) { return 0; @@ -303,7 +303,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::m to_generate = false; for (auto& request : m_requests) { const auto& sampling_params = request->get_sampling_parameters(); - if (!sampling_params.is_speculative_decoding()) { + if (!sampling_params.is_assisting_generation()) { // generate only one token in case of non speculative decoding request->pause_generation(true); } else if (request->get_num_processed_tokens() >= request->get_prompt_len() && diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index e4f3b1ad1f..4a0748b5c0 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -182,6 +182,11 @@ void ContinuousBatchingPipeline::SpeculativeDecodingImpl::step() { m_sd_metrics.update_acceptance_rate(request_id, acceptance_rate * 100); m_sd_metrics.update_draft_accepted_tokens(request_id, (updated_seq_info.inserted_tokens_cnt - 
updated_seq_info.removed_tokens_cnt)); } + + if (main_generated_requests.empty() && 0) { + m_sd_metrics.print(true); + m_sd_metrics.clean_up(); + } } std::vector @@ -266,24 +271,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector< OPENVINO_ASSERT(results.size() == input_ids.size()); generate_timer.end(); - m_sd_metrics.total_duration = generate_timer.get_duration(); - - // Print Speculative decoding metrics - if (0) { - std::cout << std::endl; - std::cout << "Total duration, ms: " << m_sd_metrics.total_duration << std::endl; - std::cout << "Draft model duration, ms: " << m_sd_metrics.draft_duration << std::endl; - std::cout << "Main model duration, ms: " << m_sd_metrics.main_duration << std::endl; - std::cout << "Draft model duration, %: " << m_sd_metrics.get_draft_duration_percentage() << std::endl; - std::cout << "Main model duration, %: " << m_sd_metrics.get_main_duration_percentage() << std::endl; - std::cout << "Main model iterations: " << m_sd_metrics.get_iteration_number(0) << std::endl; - std::cout << "Token per sec: " << float(sampling_params[0].max_new_tokens) / m_sd_metrics.total_duration << std::endl; - std::cout << "AVG acceptance rate, %: " << m_sd_metrics.get_avg_acceptance_rate(0) << std::endl; - std::cout << "Accepted tokens by draft model: " << m_sd_metrics.get_draft_accepted_tokens_counter(0) << std::endl; - std::cout << "Generated tokens: " << sampling_params[0].max_new_tokens << std::endl; - std::cout << "Accepted token rate, %: " << m_sd_metrics.get_draft_accepted_tokens_percentage(0) << std::endl; - } - return results; } diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp index 42d3f0b750..4e5602482a 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp @@ -95,4 +95,63 @@ void SpeculativeDecodingMetrics::set_generated_len(int64_t request_id, size_t ge m_generated_len.insert({ request_id, generated_len }); } +size_t SpeculativeDecodingMetrics::get_generated_len(int64_t request_id) { + return m_generated_len.at(request_id); +} + +std::vector SpeculativeDecodingMetrics::get_requests_id() { + std::vector result; + for (const auto& req : m_generated_len) { + result.push_back(req.first); + } + return result; +} + +void SpeculativeDecodingMetrics::print_acceptance_rates() { + for (const auto& a : m_acceptance_rate) { + std::cout << "Request_id: " << a.first << " ||| "; + for (const auto& b : a.second) { + std::cout << b << " "; + } + std::cout << std::endl; + } +} + +void SpeculativeDecodingMetrics::print(bool is_printing_per_request) { + if (total_duration == 0) { + total_duration = draft_duration + main_duration; + } + std::cout << "\n=============================== " << std::endl; + std::cout << "Total duration, ms: " << total_duration << std::endl; + std::cout << "Draft model duration, ms: " << draft_duration << std::endl; + std::cout << "Main model duration, ms: " << main_duration << std::endl; + std::cout << "Draft model duration, %: " << get_draft_duration_percentage() << std::endl; + std::cout << "Main model duration, %: " << get_main_duration_percentage() << std::endl; + std::cout << "AVG acceptance rate, %: " << get_avg_acceptance_rate(-1) << std::endl; + std::cout << "=============================== " << std::endl; + if (is_printing_per_request) { + for (const auto& i : get_requests_id()) { + std::cout << "REQUEST_ID: " << i << std::endl; + 
std::cout << "Main model iterations: " << get_iteration_number(i) << std::endl; + std::cout << "Token per sec: " << float(get_generated_len(i)) / total_duration << std::endl; + std::cout << "AVG acceptance rate, %: " << get_avg_acceptance_rate(i) << std::endl; + std::cout << "Accepted tokens by draft model: " << get_draft_accepted_tokens_counter(i) << std::endl; + std::cout << "Generated tokens: " << get_generated_len(i) << std::endl; + std::cout << "Accepted token rate, %: " << get_draft_accepted_tokens_percentage(i) << std::endl; + std::cout << "=============================== " << std::endl; + } + print_acceptance_rates(); + } + +} + +void SpeculativeDecodingMetrics::clean_up() { + m_acceptance_rate.clear(); + m_draft_accepted_tokens.clear(); + m_generated_len.clear(); + draft_duration = 0; + main_duration = 0; + total_duration = 0; +} + } \ No newline at end of file diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp index 5256128277..0d9173b99f 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp @@ -28,6 +28,7 @@ class SpeculativeDecodingMetrics { void update_draft_accepted_tokens(int64_t request_id, size_t num_matches); void set_generated_len(int64_t request_id, size_t generated_len); + size_t get_generated_len(int64_t request_id); size_t get_iteration_number(int64_t request_id); @@ -35,5 +36,11 @@ class SpeculativeDecodingMetrics { float get_main_duration_percentage(); float get_inference_duration_percentage(); + std::vector get_requests_id(); + + void print_acceptance_rates(); + void print(bool is_printing_per_request = false); + + void clean_up(); }; } \ No newline at end of file diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index 470ddd0cd8..a0b0faf58c 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -28,7 +28,7 @@ # LLM pipeline from .py_openvino_genai import ( LLMPipeline, - draft_model + draft_model, ) # LoRA diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 6135a187eb..524ff0f921 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -575,6 +575,7 @@ class GenerationConfig: logprobs: int max_length: int max_new_tokens: int + max_ngram_size: int min_new_tokens: int no_repeat_ngram_size: int num_assistant_tokens: int @@ -598,11 +599,13 @@ class GenerationConfig: @typing.overload def __init__(self, **kwargs) -> None: ... + def is_assisting_generation(self) -> bool: + ... def is_beam_search(self) -> bool: ... def is_greedy_decoding(self) -> bool: ... - def is_speculative_decoding(self) -> bool: + def is_prompt_lookup(self) -> bool: ... def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None: ... @@ -2122,11 +2125,7 @@ class WhisperRawPerfMetrics: @property def features_extraction_durations(self) -> list[float]: ... 
-class draft_model: +def draft_model(models_path: os.PathLike, device: str = '', **kwargs) -> openvino._pyopenvino.OVAny: """ - This class is used to enable Speculative Decoding + device on which inference will be performed """ - def __init__(self, models_path: os.PathLike, device: str = '', **kwargs) -> None: - """ - device on which inference will be performed - """ diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index d24a915dd6..b1a5c6cd2e 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -107,12 +107,14 @@ void init_generation_config(py::module_& m) { .def_readwrite("logprobs", &GenerationConfig::logprobs) .def_readwrite("assistant_confidence_threshold", &GenerationConfig::assistant_confidence_threshold) .def_readwrite("num_assistant_tokens", &GenerationConfig::num_assistant_tokens) + .def_readwrite("max_ngram_size", &GenerationConfig::max_ngram_size) .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output) .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids) .def_readwrite("adapters", &GenerationConfig::adapters) .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) .def("is_beam_search", &GenerationConfig::is_beam_search) .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding) - .def("is_speculative_decoding", &GenerationConfig::is_speculative_decoding) + .def("is_assisting_generation", &GenerationConfig::is_assisting_generation) + .def("is_prompt_lookup", &GenerationConfig::is_prompt_lookup) .def("update_generation_config", static_cast(&ov::genai::GenerationConfig::update_generation_config), py::arg("config_map")); } diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp index b53cc56f10..b1d5136253 100644 --- a/src/python/py_llm_pipeline.cpp +++ b/src/python/py_llm_pipeline.cpp @@ -195,15 +195,14 @@ void init_llm_pipeline(py::module_& m) { .def("get_generation_config", &LLMPipeline::get_generation_config, py::return_value_policy::copy) .def("set_generation_config", &LLMPipeline::set_generation_config, py::arg("config")); - py::class_(m, "draft_model", py::module_local(), "This class is used to enable Speculative Decoding") - .def(py::init([]( + m.def("draft_model", []( const std::filesystem::path& models_path, const std::string& device, const py::kwargs& kwargs ) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); return draft_model(models_path, device, pyutils::kwargs_to_any_map(kwargs)).second; - }), + }, py::arg("models_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", py::arg("device") = "", "device on which inference will be performed"); } diff --git a/src/python/py_openvino_genai.cpp b/src/python/py_openvino_genai.cpp index e821c1cfdc..429f48f30d 100644 --- a/src/python/py_openvino_genai.cpp +++ b/src/python/py_openvino_genai.cpp @@ -21,7 +21,6 @@ using ov::genai::DecodedResults; using ov::genai::EncodedResults; using ov::genai::StreamerBase; using ov::genai::StringInputs; -using ov::genai::draft_model; void init_lora_adapter(py::module_& m); void init_perf_metrics(py::module_& m); diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index f404e63cff..093cd993de 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -19,6 +19,7 @@ file(GLOB src_files "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sequence_group.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/cache_eviction.cpp" 
"${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sampler.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/speculative_decoding/*.cpp" + "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/prompt_lookup/*.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/utils/*.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/utils.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/continuous_batching*.cpp"