From 8bef5a317ebf1d9817a191d64dda7902ac787ab3 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 4 Dec 2024 08:58:52 +0400 Subject: [PATCH] Port from master (#1285) - https://github.com/openvinotoolkit/openvino.genai/pull/1158 - https://github.com/openvinotoolkit/openvino.genai/pull/1178 - https://github.com/openvinotoolkit/openvino.genai/pull/1214 - https://github.com/openvinotoolkit/openvino.genai/pull/1243 - https://github.com/openvinotoolkit/openvino.genai/pull/1253 - https://github.com/openvinotoolkit/openvino.genai/pull/1259 - https://github.com/openvinotoolkit/openvino.genai/pull/1266 - https://github.com/openvinotoolkit/openvino.genai/pull/1271 - https://github.com/openvinotoolkit/openvino.genai/pull/1278 - https://github.com/openvinotoolkit/openvino.genai/pull/1280 - https://github.com/openvinotoolkit/openvino.genai/pull/1284 - e4a86f615336d583534ae9ab2970c98985e78cfd - https://github.com/openvinotoolkit/openvino.genai/pull/1246 - https://github.com/openvinotoolkit/openvino.genai/pull/958 --------- Co-authored-by: Anastasiia Pnevskaia Co-authored-by: Helena Kloosterman Co-authored-by: Vladimir Zlobin Co-authored-by: Dmitry Matveev Co-authored-by: Anna Likholat Co-authored-by: Alina Kladieva --- .github/labeler.yml | 2 +- .github/workflows/windows.yml | 4 +- CMakeLists.txt | 35 +- README.md | 25 +- pyproject.toml | 29 +- requirements-build.txt | 3 +- src/cpp/CMakeLists.txt | 15 + .../genai/visual_language/pipeline.hpp | 13 + src/cpp/src/block_manager.hpp | 20 +- src/cpp/src/continuous_batching_impl.cpp | 22 +- .../stable_diffusion_xl_pipeline.hpp | 11 +- src/cpp/src/llm_pipeline.cpp | 4 +- src/cpp/src/llm_pipeline_static.cpp | 32 +- src/cpp/src/llm_pipeline_static.hpp | 4 - src/cpp/src/sequence_group.hpp | 3 + .../speculative_decoding_impl.cpp | 2 +- src/cpp/src/utils.cpp | 2 +- src/cpp/src/utils.hpp | 2 +- src/cpp/src/visual_language/pipeline.cpp | 9 + src/cpp/src/whisper_pipeline.cpp | 2 +- src/python/CMakeLists.txt | 143 +- src/python/compare_pyi.cmake | 28 + src/python/openvino_genai/__init__.pyi | 38 + .../openvino_genai/py_openvino_genai.pyi | 1684 +++++++++++++++++ .../py_continuous_batching_pipeline.cpp | 92 +- src/python/py_generation_config.cpp | 12 +- src/python/py_image_generation_models.cpp | 65 +- src/python/py_image_generation_pipelines.cpp | 60 +- src/python/py_llm_pipeline.cpp | 10 +- src/python/py_lora_adapter.cpp | 22 +- src/python/py_openvino_genai.cpp | 9 +- src/python/py_perf_metrics.cpp | 4 +- src/python/py_tokenizer.cpp | 6 +- src/python/py_vlm_pipeline.cpp | 33 +- src/python/py_whisper_pipeline.cpp | 8 +- 35 files changed, 2245 insertions(+), 208 deletions(-) create mode 100644 src/python/compare_pyi.cmake create mode 100644 src/python/openvino_genai/__init__.pyi create mode 100644 src/python/openvino_genai/py_openvino_genai.pyi diff --git a/.github/labeler.yml b/.github/labeler.yml index d4d0ac0965..c5d0db312c 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -36,7 +36,7 @@ - 'tests/cpp/generate_config.cpp' - 'tests/cpp/sampler.cpp' -- 'category: LoRA': +'category: LoRA': - 'src/cpp/include/openvino/genai/lora_adapter.hpp' - 'src/cpp/src/lora_adapter.cpp' - 'src/cpp/src/lora_helper.cpp' diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 33096d6d7b..70bac20af0 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -243,7 +243,7 @@ jobs: - name: Test bindings (wheel) run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - python -m pip install . --verbose + python -m pip install . 
--verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template" genai_python_lib_whisper: @@ -307,7 +307,7 @@ jobs: - name: Test bindings (wheel) run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - python -m pip install . --verbose + python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke" genai_python_lib_vlm: diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c6e56d427..18a3600c65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,26 +25,45 @@ if(POLICY CMP0169) cmake_policy(SET CMP0169 OLD) endif() +if(UNIX AND NOT (APPLE OR ANDROID OR CYGWIN)) + set(LINUX ON) +endif() + project(OpenVINOGenAI VERSION 2024.5.0.0 DESCRIPTION "OpenVINO GenAI" HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai" LANGUAGES CXX C) +if(NOT DEFINED Python3_FIND_VIRTUALENV) + set(Python3_FIND_VIRTUALENV FIRST) +endif() + +# Looking for OpenVINO in the python distribution. It doesn't work for cross-compiling build +if(NOT CMAKE_CROSSCOMPILING) + find_package(Python3 REQUIRED) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "from openvino.utils import get_cmake_path; print(get_cmake_path(), end='')" + OUTPUT_VARIABLE OpenVINO_DIR_PY + ERROR_QUIET + ) +endif() + # Find OpenVINODeveloperPackage first to compile with SDL flags find_package(OpenVINODeveloperPackage ${OpenVINOGenAI_VERSION} QUIET COMPONENTS Runtime Threading PATHS "${OpenVINO_DIR}") if(NOT OpenVINODeveloperPackage_FOUND) find_package(OpenVINO ${OpenVINOGenAI_VERSION} REQUIRED - COMPONENTS Runtime Threading) + COMPONENTS Runtime Threading + PATHS "${OpenVINO_DIR_PY}") endif() include(cmake/features.cmake) if(ENABLE_PYTHON) # the following two calls are required for cross-compilation - if(OpenVINODeveloperPackage_DIR) + if(OpenVINODeveloperPackage_FOUND) ov_find_python3(REQUIRED) ov_detect_python_module_extension() else() @@ -62,9 +81,15 @@ endif() add_subdirectory(thirdparty) add_subdirectory(src) -add_subdirectory(samples) -add_subdirectory(tools/continuous_batching) -add_subdirectory(tests/cpp) +if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/samples") + add_subdirectory(samples) +endif() +if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/tools/continuous_batching") + add_subdirectory(tools/continuous_batching) +endif() +if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/tests/cpp") + add_subdirectory(tests/cpp) +endif() install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) diff --git a/README.md b/README.md index fe18205028..c1217a0215 100644 --- a/README.md +++ b/README.md @@ -117,17 +117,34 @@ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code -- ### Run generation using VLMPipeline API in Python +See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/visual_language_chat) for a demo application. 
+ +Run the following command to download a sample image: + +```sh +curl -O "https://storage.openvinotoolkit.org/test_data/images/dog.jpg" +``` + ```python +import numpy as np +import openvino as ov import openvino_genai as ov_genai -#Will run model on CPU, GPU is a possible option +from PIL import Image + +# Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU pipe = ov_genai.VLMPipeline("./MiniCPM-V-2_6/", "CPU") -rgb = read_image("cat.jpg") -print(pipe.generate(prompt, image=rgb, max_new_tokens=100)) + +image = Image.open("dog.jpg") +image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8) +image_data = ov.Tensor(image_data) + +prompt = "Can you describe the image?" +print(pipe.generate(prompt, image=image_data, max_new_tokens=100)) ``` ### Run generation using VLMPipeline in C++ -Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details) +Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details). See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/cpp/visual_language_chat) for a demo application. ```cpp #include "load_image.hpp" diff --git a/pyproject.toml b/pyproject.toml index 154c8f9a3e..3e3ec90beb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,16 +3,31 @@ name = "openvino-genai" version = "2024.5.0.0" description = "Library of the most popular Generative AI model pipelines, optimized execution methods, and samples" requires-python = ">=3.9" -readme = {file = "src/README.md", content-type="text/markdown"} -license = {text = "OSI Approved :: Apache Software License"} +readme = { file = "src/README.md", content-type="text/markdown" } +license = { "file" = "LICENSE" } authors = [ { name = "OpenVINO Developers", email = "openvino@intel.com" }, ] classifiers = [ + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Operating System :: Unix", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Operating System :: MacOS", + "Programming Language :: C++", + "Programming Language :: C", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: Implementation :: CPython" ] dependencies = [ "openvino_tokenizers~=2024.5.0.0.dev" @@ -22,22 +37,24 @@ dependencies = [ directory = "src/python" [tool.py-build-cmake.sdist] -exclude = ["tools", "samples", "tests", "thirdparty"] +include = ["CMakeLists.txt", "LICENSE", "third-party-programs.txt", "SECURITY.md", "cmake", "src", "thirdparty"] [tool.py-build-cmake.cmake] minimum_version = "3.23" build_type = "Release" config = ["Release"] find_python3 = true -build_args = ["--parallel", "--target", "py_openvino_genai"] +build_args = ["--parallel", "--target", "py_openvino_genai_stub"] install_args = ["--strip"] 
install_components = ["wheel_genai"] options = {"BUILD_TOKENIZERS" = "OFF"} [build-system] requires = [ - "py-build-cmake@git+https://github.com/tttapa/py-build-cmake@7ab73da351c7140f06d727a8705bece4cf544cd9", - "cmake~=3.23" + "py-build-cmake==0.3.1", + "pybind11-stubgen==2.5.1", + "openvino~=2024.5.0.0.dev", + "cmake~=3.23.0" ] build-backend = "py_build_cmake.build" diff --git a/requirements-build.txt b/requirements-build.txt index 2611a89b08..6da3919e91 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1 +1,2 @@ -cmake~=3.30 \ No newline at end of file +cmake~=3.23.0 +pybind11-stubgen==2.5.1 \ No newline at end of file diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 6a18bc969c..d02f32ded9 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -96,6 +96,21 @@ else() SOVERSION ${MAJOR_SUFFIX}${OpenVINOGenAI_VERSION_MINOR}${OpenVINOGenAI_VERSION_PATCH}) endif() +if(OpenVINODeveloperPackage_FOUND) + # must be called after all target_link_libraries + # ov_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) + + ov_ncc_naming_style(FOR_TARGET ${TARGET_NAME} + SOURCE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/include") + + # TODO: override versions as currently they come from OpenVINO + # ov_add_vs_version_file(NAME ${TARGET_NAME} + # FILEDESCRIPTION "OpenVINO GenAI library") + + # TODO: commit changes separately + # ov_add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}) +endif() + # - Windows: `\runtime\bin\intel64\Release\` # - MacOS_x86: `/runtime/lib/intel64/Release` # - MacOS_arm64: `/runtime/lib/arm64/Release/` diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 7db731d57c..b9b6ef92e8 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -99,6 +99,19 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { const StreamerVariant& streamer ); + /// @brief Generate a response given a prompt and uint8 RGB image with [NHWC] or [HWC] layout. + /// @param prompt A prompt to respond to. + /// @param image Image to be prepended to a prompt. + /// @param generation_config A config to follow for text generation. + /// @param streamer A streamer to acquire intermediate result. + /// @return A string generated by a model. + DecodedResults generate( + const std::string& prompt, + const ov::Tensor& rgb, + const GenerationConfig& generation_config, + const StreamerVariant& streamer + ); + /// @brief Generate a response given a prompt and config. /// @param prompt A prompt to respond to. 
/// @param config_map A config may contain GenerationConfig, values diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp index c96c17bd15..dc82897dc8 100644 --- a/src/cpp/src/block_manager.hpp +++ b/src/cpp/src/block_manager.hpp @@ -12,7 +12,6 @@ #include "sequence_group.hpp" - namespace ov::genai { class KVCacheBlock { @@ -188,7 +187,10 @@ class CacheStateDumper; */ class BlockAllocator { std::vector> m_free_blocks; - int m_total_num_blocks; + // We keep m_free_blocks_num instead of m_free_blocks[X].size() to WA old CXX library implementation issue for std::list::size() + // see https://stackoverflow.com/questions/13157164/why-isnt-stdlist-size-constant-time + std::vector m_free_blocks_num; + size_t m_total_num_blocks; friend class CacheStateDumper; size_t m_num_layers; bool m_enable_prefix_caching; @@ -202,8 +204,8 @@ class BlockAllocator { * @param num_layers The number of separate attention layers with KV caches in the LLM associated with the pipeline. * Blocks returned will be vectors with this size, each vector entry to be associated with a separate layer's KV cache. */ - BlockAllocator(int num_blocks, bool enable_prefix_caching, size_t num_layers = 1) : - m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) { + BlockAllocator(size_t num_blocks, bool enable_prefix_caching, size_t num_layers = 1) : + m_free_blocks_num(num_layers, num_blocks), m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) { OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); m_free_blocks.resize(m_num_layers); for (auto& per_layer_block_list : m_free_blocks) { @@ -224,7 +226,7 @@ class BlockAllocator { * @return Number of free blocks for this layer. 
*/ size_t num_free_blocks(size_t layer_idx) const { - return m_free_blocks[layer_idx].size() + m_overwriteable_blocks.num_blocks(); + return m_free_blocks_num[layer_idx] + num_overwriteable_blocks(); } /** @@ -270,6 +272,7 @@ class BlockAllocator { block_ptr->release(); if (block_ptr->is_free()) { m_free_blocks[layer_idx].push_back(block_ptr); + ++m_free_blocks_num[layer_idx]; } } @@ -325,6 +328,7 @@ class BlockAllocator { // actual collision case for (size_t layer_idx = 0; layer_idx < colliding_blocks_per_layer.size(); layer_idx++) { m_free_blocks[layer_idx].push_back(colliding_blocks_per_layer[layer_idx]); + ++m_free_blocks_num[layer_idx]; } } m_overwriteable_blocks.add(blocks_for_all_layers); @@ -333,12 +337,14 @@ class BlockAllocator { // TODO (vshampor): more fine-grained hash store control for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) { m_free_blocks[layer_idx].push_back(blocks_for_all_layers[layer_idx]); + ++m_free_blocks_num[layer_idx]; } } } else { for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) { m_free_blocks[layer_idx].push_back(blocks_for_all_layers[layer_idx]); + ++m_free_blocks_num[layer_idx]; } } } @@ -368,6 +374,7 @@ class BlockAllocator { KVCacheBlock::Ptr allocated_block = m_free_blocks[layer_idx].front(); allocated_block->increment(); m_free_blocks[layer_idx].pop_front(); + --m_free_blocks_num[layer_idx]; return allocated_block; } @@ -386,7 +393,7 @@ class BlockAllocator { OPENVINO_ASSERT(m_enable_prefix_caching); OPENVINO_ASSERT(can_allocate_blocks(1)); - if (m_free_blocks[0].size() > 0) { + if (m_free_blocks_num[0] > 0) { // allocate new empty block BlocksPerLayer allocated_blocks; allocated_blocks.reserve(m_num_layers); @@ -396,6 +403,7 @@ class BlockAllocator { allocated_block->set_hash(hash); allocated_blocks.push_back(allocated_block); m_free_blocks[i].pop_front(); + --m_free_blocks_num[i]; } cached_blocks[hash] = allocated_blocks; return allocated_blocks; diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 916167b63b..c38f54bacf 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -21,7 +21,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( ov::Core core; - auto [core_properties, compile_properties] = utils::split_core_complile_config(properties); + auto [core_properties, compile_properties] = utils::split_core_compile_config(properties); core.set_property(core_properties); // The model can be compiled for GPU as well @@ -57,7 +57,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( } SchedulerConfig updated_config = scheduler_config; - // update KV number in scheduler config + // update KV blocks number in scheduler config if (scheduler_config.num_kv_blocks != device_config.get_num_kv_blocks()) { updated_config.num_kv_blocks = device_config.get_num_kv_blocks(); } @@ -166,24 +166,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() { timer.start(); logits = m_model_runner->forward(m_requests, scheduler_output); timer.end(); - - ov::InferRequest infer_request = m_model_runner->get_infer_request(); - ov::CompiledModel compiled_model = infer_request.get_compiled_model(); - const bool is_profiling_enabled = compiled_model.get_property(ov::enable_profiling); - - // collect detailed statistic - if (is_profiling_enabled) { - std::vector profiling_info = m_model_runner->get_infer_request().get_profiling_info(); - for (const ov::ProfilingInfo& 
info : profiling_info) { - double current_time = info.real_time.count(); - if (info.node_type == "PagedAttentionExtension") { - m_perf.m_paged_attention_time_ms += current_time; - } else if (info.node_type == "FullyConnected") { - m_perf.m_matmul_time_ms += current_time; - } - m_perf.m_infer_total_ms += current_time; - } - } } #ifdef DEBUG_CACHE_STATE_DUMP diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp index b709c58f47..af40b5cfa2 100644 --- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp @@ -111,12 +111,19 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { OPENVINO_THROW("Unsupported '", unet, "' UNet type"); } + // Temporary fix for GPU + ov::AnyMap updated_roperties = properties; + if (device.find("GPU") != std::string::npos && + updated_roperties.find("INFERENCE_PRECISION_HINT") == updated_roperties.end()) { + updated_roperties["INFERENCE_PRECISION_HINT"] = ov::element::f32; + } + const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) - m_vae = std::make_shared(root_dir / "vae_decoder", device, properties); + m_vae = std::make_shared(root_dir / "vae_decoder", device, updated_roperties); else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { - m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder", device, properties); + m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder", device, updated_roperties); } else { OPENVINO_ASSERT("Unsupported pipeline type"); } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 15a6ee4a12..8e83983bb1 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -63,7 +63,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { { ov::Core core; if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { - auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(*filtered_plugin_config); + auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(*filtered_plugin_config); core.set_property(core_plugin_config); auto model = core.read_model(models_path / "openvino_model.xml"); m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model."); @@ -71,7 +71,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { utils::slice_matmul_statefull_model(model); m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); } else { - auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); + auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(plugin_config); core.set_property(core_plugin_config); auto model = core.read_model(models_path / "openvino_model.xml"); utils::slice_matmul_statefull_model(model); diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 701d2eca28..6e2ecc9ea7 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -698,45 +698,45 @@ void StaticLLMPipeline::setupAndCompileModels( // NB: Get information about NPU if available auto npudesc = extract_npu_descriptor(core); // (1) Read the template model - this will be kvcache model - 
m_kvcache_model = core.read_model((models_path / "openvino_model.xml").string()); + auto kvcache_model = core.read_model((models_path / "openvino_model.xml").string()); // (2) Expose KV-cache input and output layers from kvcache model - ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); + ov::pass::StatefulToStateless().run_on_model(kvcache_model); // (3) Align u4 ZP constants - align_u4_zp_constants(m_kvcache_model); + align_u4_zp_constants(kvcache_model); // (4) Clone the model - this will be prefill - m_prefill_model = m_kvcache_model->clone(); - m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); + auto prefill_model = kvcache_model->clone(); + prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); // (5) Reshape both models to static shape const uint32_t kMaxPromptLen = align_to(pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u), 64u); const uint32_t kMinResponseLen = align_to(pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u), 64u); ModelDesc model_desc = get_modeldesc_from_json(models_path / "config.json"); KVAxesPosition axes = get_kv_axes(model_desc.type); m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len, false}; - reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); - reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); + reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); + reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes); // (6) Apply opt layout if applicable // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model if ( model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" || (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) { - if (optimize_value_tensors(m_kvcache_model)) { + if (optimize_value_tensors(kvcache_model)) { // NB: Check if TransposeValueTensors transformation was applied m_kvcache_desc.v_tensors_transposed = true; - m_prefill_model = cvt_value_tensors_layout(m_prefill_model); + prefill_model = cvt_value_tensors_layout(prefill_model); } } // (7) Replace KV-cache tensors for the entire cache to tensors only for new token (before concat) - m_kvcache_model = redirect_new_kv_to_output(m_kvcache_model); + kvcache_model = redirect_new_kv_to_output(kvcache_model); // (8) Convert kvcache tensors to fp16 precision - m_kvcache_model = cvt_kvcache_to_fp16(m_kvcache_model); - m_prefill_model = cvt_kvcache_to_fp16(m_prefill_model); + kvcache_model = cvt_kvcache_to_fp16(kvcache_model); + prefill_model = cvt_kvcache_to_fp16(prefill_model); // (9) Compile both model auto prefill_config = pop_or_default( - properties, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model, npudesc) + properties, "PREFILL_CONFIG", get_default_prefill_config(prefill_model, npudesc) ); // NB: GENERATE_HINT is only applicable for default generate config! 
auto generate_hint = str_to_hint(pop_or_default(properties, "GENERATE_HINT", to_string(GenerateHint::FAST_COMPILE))); auto generate_config = pop_or_default( - properties, "GENERATE_CONFIG", get_default_generate_config(m_kvcache_model, npudesc, generate_hint) + properties, "GENERATE_CONFIG", get_default_generate_config(kvcache_model, npudesc, generate_hint) ); merge_config_with(prefill_config, properties); merge_config_with(generate_config, properties); @@ -745,10 +745,10 @@ void StaticLLMPipeline::setupAndCompileModels( set_npuw_cache_dir(generate_config); m_kvcache_request = core.compile_model( - m_kvcache_model, device, generate_config + kvcache_model, device, generate_config ).create_infer_request(); m_prefill_request = core.compile_model( - m_prefill_model, device, prefill_config + prefill_model, device, prefill_config ).create_infer_request(); } diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 2f9969f5d7..d8e59d867a 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -61,10 +61,6 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { bool v_tensors_transposed; }; - // FIXME: Ideally, we don't need to keep those - std::shared_ptr m_kvcache_model; - std::shared_ptr m_prefill_model; - KVCacheDesc m_kvcache_desc; ov::InferRequest m_kvcache_request; ov::InferRequest m_prefill_request; diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index c5be82f0f2..6755255fe8 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -477,6 +477,9 @@ class SequenceGroup { } void clear_waiting_sequences() { + if (!is_waiting()) + return; + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { if (m_sequences[seq_id]->is_waiting()) { m_sequences[seq_id]->set_status(SequenceStatus::RUNNING); diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index 0f43555a5f..4e43fdadc9 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -31,7 +31,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl( const ov::genai::ModelDesc draft_model_desc, const ov::AnyMap& tokenizer_properties) { ov::Core core; - auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(main_properties); + auto [core_properties, compile_properties] = ov::genai::utils::split_core_compile_config(main_properties); core.set_property(core_properties); std::filesystem::path openvino_model_name = "openvino_model.xml", diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index bc3bef8e0c..e042ead293 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -205,7 +205,7 @@ ProcessorConfig from_any_map( * There are not supported by `core.compile` function plugin options like `ENABLE_MMAP` * Move this options to `core.set_property` config */ -std::pair split_core_complile_config(const ov::AnyMap& properties) { +std::pair split_core_compile_config(const ov::AnyMap& properties) { const std::vector unsupported_by_compile_properties{"ENABLE_MMAP"}; ov::AnyMap core_properties; ov::AnyMap compile_properties{properties}; diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index bbb51174a3..2ab2bff0b4 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -64,7 +64,7 @@ ProcessorConfig from_any_map( const ProcessorConfig& initial ); 
-std::pair split_core_complile_config(const ov::AnyMap& properties); +std::pair split_core_compile_config(const ov::AnyMap& properties); ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 427a5c9229..4062e39da2 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -305,6 +305,15 @@ DecodedResults VLMPipeline::generate( return m_pimpl->generate(prompt, rgbs, generation_config, streamer); } +DecodedResults VLMPipeline::generate( + const std::string& prompt, + const ov::Tensor& rgb, + const GenerationConfig& generation_config, + const StreamerVariant& streamer +) { + return m_pimpl->generate(prompt, {rgb}, generation_config, streamer); +} + DecodedResults VLMPipeline::generate( const std::string& prompt, const ov::AnyMap& config_map diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index c0e486018a..6d08da1b14 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ b/src/cpp/src/whisper_pipeline.cpp @@ -39,7 +39,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi const ov::AnyMap& properties) : WhisperPipelineImplBase{models_path} { ov::Core core = utils::singleton_core(); - auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(properties); + auto [core_properties, compile_properties] = ov::genai::utils::split_core_compile_config(properties); core.set_property(core_properties); m_models.encoder = diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index d74331a021..75a2fd59a7 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -20,18 +20,28 @@ endif() file(GLOB python_sources "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp") -pybind11_add_module(py_openvino_genai ${python_sources}) -target_link_libraries(py_openvino_genai PRIVATE openvino::genai) -target_include_directories(py_openvino_genai PRIVATE "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src") # for tokenizers_path.hpp -set_target_properties(py_openvino_genai PROPERTIES +set(TARGET_NAME py_openvino_genai) +pybind11_add_module(${TARGET_NAME} ${python_sources}) + +target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai) +target_include_directories(${TARGET_NAME} PRIVATE "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src") # for tokenizers_path.hpp +set_target_properties(${TARGET_NAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" ) -file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" DESTINATION "${CMAKE_BINARY_DIR}/openvino_genai/") +file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" + "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.pyi" + "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/py_openvino_genai.pyi" + DESTINATION "${CMAKE_BINARY_DIR}/openvino_genai/") configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/__version__.py.in" "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" @ONLY) +if(OpenVINODeveloperPackage_FOUND) + # TODO: commit changes separately + # ov_add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}) +endif() + if(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME) # RPATH for wheel is mandatory to find openvino_genai library. 
It # must be forced because GenAI may be built with OpenVINO targeting @@ -53,14 +63,16 @@ elseif(APPLE) endif() if(rpaths) - set_target_properties(py_openvino_genai PROPERTIES INSTALL_RPATH "${rpaths}") + set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "${rpaths}") endif() install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" + "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.pyi" + "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/py_openvino_genai.pyi" "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" DESTINATION python/openvino_genai COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) -install(TARGETS py_openvino_genai +install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION python/openvino_genai COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) @@ -78,6 +90,121 @@ install(FILES "${OpenVINOGenAI_SOURCE_DIR}/LICENSE" # wheel_genai component is used for wheel generation in pyproject.toml. # Exclude wheel_genai from normal packaging because there's pygenai_X_Y component for that. -install(TARGETS openvino_genai py_openvino_genai +install(TARGETS openvino_genai ${TARGET_NAME} LIBRARY DESTINATION openvino_genai COMPONENT wheel_genai EXCLUDE_FROM_ALL RUNTIME DESTINATION openvino_genai COMPONENT wheel_genai EXCLUDE_FROM_ALL) + +# Generate or check .pyi stub files generated by pybind11-stub + +set(pyproject_toml "${OpenVINOGenAI_SOURCE_DIR}/pyproject.toml") +file(STRINGS ${pyproject_toml} pybind11_stubgen_dep REGEX "pybind11-stubgen") + +if(pybind11_stubgen_dep MATCHES "pybind11-stubgen==[0-9\.]+") + set(pybind11_stubgen_dep "${CMAKE_MATCH_0}") +else() + message(FATAL_ERROR "Internal error: failed to parse pybind11-stubgen version from from '${pyproject_toml}'") +endif() + +if(OpenVINODeveloperPackage_FOUND) + ov_check_pip_package(REQUIREMENT ${pybind11_stubgen_dep} + RESULT_VAR pybind11_stubgen_AVAILABLE + WARNING_MESSAGE "Please, install ${pybind11_stubgen_dep} if you plan to develop Python OpenVINO GenAI API" + MESSAGE_MODE WARNING) +elseif(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME AND NOT WIN32) + # in case of wheel build, pybind11-stubgen is always available via pyproject.toml's build-system + # except Win32 where we have issues with pybind11_stubgen executable which cannot import its own module + set(pybind11_stubgen_AVAILABLE ON) + + # by default, wheel build is performed with build-isolation, which means that some variables like PYTHONPATH + # are not available. But if user called setupvars.sh, then OpenVINO dir is available, while PYTHONPATH - no. + # In this case, we will have mismatch on Linux when OpenVINO can point on build dir / install dir, while + # PYTHONPATH points out to locally installed tmp OpenVINO wheel (build against wheel). + # Ways to handle it: + # - setting PYTHONPATH to $ENV{INTEL_OPENVINO_DIR}/python if INTEL_OPENVINO_DIR is defined. 
It means we are building against + # OpenVINO archive or installation tree + # - if it's not defined, we cannot do any guesses and hence, disable pybind11-stubgen usage + if(DEFINED ENV{INTEL_OPENVINO_DIR}) + set(openvino_pythonpath "$ENV{INTEL_OPENVINO_DIR}/python") + elseif(LINUX AND NOT OpenVINO_DIR STREQUAL OpenVINO_DIR_PY) + # here we imply that OpenVINO_DIR_PY points to manylinux, while OpenVINO_DIR point to Ubuntu binaries + set(pybind11_stubgen_AVAILABLE OFF) + endif() +endif() + +# but we also need to check whether OpenVINO is installed +if(CMAKE_CROSSCOMPILING) + # we cannot check OpenVINO during cross-compile + set(pybind11_stubgen_AVAILABLE OFF) +else() + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import openvino" + RESULT_VARIABLE EXIT_CODE + OUTPUT_VARIABLE OUTPUT_TEXT + ERROR_VARIABLE ERROR_TEXT) + + # OpenVINO is not available because of import error + if(NOT EXIT_CODE EQUAL 0) + set(pybind11_stubgen_AVAILABLE OFF) + endif() +endif() + +if(pybind11_stubgen_AVAILABLE) + if(DEFINED ENV{CI} OR DEFINED ENV{TF_BUILD} OR DEFINED ENV{JENKINS_URL}) + set(ci_run ON) + endif() + + set(stub_files_location "${OpenVINOGenAI_BINARY_DIR}/src/python") + set(generated_files ${stub_files_location}/openvino_genai/__init__.pyi + ${stub_files_location}/openvino_genai/py_openvino_genai.pyi) + set_source_files_properties(${generated_files} PROPERTIES GENERATED ON) + + if(COMMAND find_host_program) + find_host_program(pybind11_stubgen NAMES pybind11-stubgen NO_CACHE REQUIRED) + else() + find_program(pybind11_stubgen NAMES pybind11-stubgen NO_CACHE REQUIRED) + endif() + + if(ci_run) + set(validation_command + COMMAND "${CMAKE_COMMAND}" + -D generated_pyi_files_location=${stub_files_location} + -D source_pyi_files_location=${CMAKE_CURRENT_SOURCE_DIR} + -P "${CMAKE_CURRENT_SOURCE_DIR}/compare_pyi.cmake") + set(validation_dependencies + "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.pyi" + "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/py_openvino_genai.pyi") + else() + set(copy_to_source_command + COMMAND "${CMAKE_COMMAND}" -E copy ${generated_files} "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/") + endif() + + set(output_file pybind11_stub_gen_completed.txt) + add_custom_command(OUTPUT ${output_file} + COMMAND "${CMAKE_COMMAND}" -E rm -f "${CMAKE_BINARY_DIR}/openvino_genai/__init__.pyi" + "${CMAKE_BINARY_DIR}/openvino_genai/py_openvino_genai.pyi" + COMMAND "${CMAKE_COMMAND}" -E env PYTHONPATH=${CMAKE_BINARY_DIR}:${openvino_pythonpath}:$ENV{PYTHONPATH} + ${pybind11_stubgen} --output-dir ${stub_files_location} openvino_genai + ${validation_command} + ${copy_to_source_command} + COMMAND "${CMAKE_COMMAND}" -E copy ${generated_files} "${CMAKE_BINARY_DIR}/openvino_genai/" + COMMAND "${CMAKE_COMMAND}" -E touch ${output_file} + DEPENDS + ${python_sources} + ${validation_dependencies} + "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" + "${CMAKE_CURRENT_SOURCE_DIR}/compare_pyi.cmake" + COMMENT "[${pybind11_stubgen_dep}] Generate .pyi files" + VERBATIM) + + add_custom_target(${TARGET_NAME}_stub ALL DEPENDS ${output_file}) +elseif(OpenVINODeveloperPackage_FOUND) + # Produce warning message at build time as well + add_custom_command(OUTPUT pybind11_stub_gen_not_found.txt + COMMAND ${CMAKE_COMMAND} + -E cmake_echo_color --red "Warning: Please, install ${pybind11_stubgen_dep}") + add_custom_target(${TARGET_NAME}_stub ALL DEPENDS pybind11_stub_gen_not_found.txt) +else() + add_custom_target(${TARGET_NAME}_stub ALL) +endif() + +add_dependencies(${TARGET_NAME}_stub ${TARGET_NAME}) diff --git 
a/src/python/compare_pyi.cmake b/src/python/compare_pyi.cmake new file mode 100644 index 0000000000..62234d60d4 --- /dev/null +++ b/src/python/compare_pyi.cmake @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +foreach(var IN ITEMS generated_pyi_files_location source_pyi_files_location) + if(NOT DEFINED ${var}) + message(FATAL_ERROR "Variable ${var} is not defined") + endif() +endforeach() + +file(GLOB_RECURSE pyi_files ${generated_pyi_files_location}/*.pyi) + +# perform comparison of generated files with committed ones +foreach(pyi_file IN LISTS pyi_files) + string(REPLACE ${generated_pyi_files_location} ${source_pyi_files_location} commited_pyi_file "${pyi_file}") + if(NOT EXISTS "${commited_pyi_file}") + message(FATAL_ERROR "${commited_pyi_file} does not exists. Please, install pybind11-stubgen and generate .pyi files") + else() + execute_process(COMMAND "${CMAKE_COMMAND}" -E compare_files "${pyi_file}" "${commited_pyi_file}" + OUTPUT_VARIABLE output_message + ERROR_VARIABLE error_message + RESULT_VARIABLE exit_code + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT exit_code EQUAL 0) + message(FATAL_ERROR "File ${commited_pyi_file} is outdated and need to be regenerated with pybind11-stubgen") + endif() + endif() +endforeach() diff --git a/src/python/openvino_genai/__init__.pyi b/src/python/openvino_genai/__init__.pyi new file mode 100644 index 0000000000..5f402f8b55 --- /dev/null +++ b/src/python/openvino_genai/__init__.pyi @@ -0,0 +1,38 @@ +""" +openvino genai module namespace, exposing pipelines and configs to create these pipelines. +""" +from __future__ import annotations +import openvino as openvino +from openvino_genai.py_openvino_genai import Adapter +from openvino_genai.py_openvino_genai import AdapterConfig +from openvino_genai.py_openvino_genai import AggregationMode +from openvino_genai.py_openvino_genai import AutoencoderKL +from openvino_genai.py_openvino_genai import CLIPTextModel +from openvino_genai.py_openvino_genai import CLIPTextModelWithProjection +from openvino_genai.py_openvino_genai import CacheEvictionConfig +from openvino_genai.py_openvino_genai import ContinuousBatchingPipeline +from openvino_genai.py_openvino_genai import CppStdGenerator +from openvino_genai.py_openvino_genai import DecodedResults +from openvino_genai.py_openvino_genai import EncodedResults +from openvino_genai.py_openvino_genai import GenerationConfig +from openvino_genai.py_openvino_genai import GenerationResult +from openvino_genai.py_openvino_genai import Generator +from openvino_genai.py_openvino_genai import LLMPipeline +from openvino_genai.py_openvino_genai import PerfMetrics +from openvino_genai.py_openvino_genai import RawPerfMetrics +from openvino_genai.py_openvino_genai import Scheduler +from openvino_genai.py_openvino_genai import SchedulerConfig +from openvino_genai.py_openvino_genai import StopCriteria +from openvino_genai.py_openvino_genai import StreamerBase +from openvino_genai.py_openvino_genai import Text2ImagePipeline +from openvino_genai.py_openvino_genai import TokenizedInputs +from openvino_genai.py_openvino_genai import Tokenizer +from openvino_genai.py_openvino_genai import UNet2DConditionModel +from openvino_genai.py_openvino_genai import VLMPipeline +from openvino_genai.py_openvino_genai import WhisperGenerationConfig +from openvino_genai.py_openvino_genai import WhisperPipeline +from openvino_genai.py_openvino_genai import draft_model +import os as os +from . 
import py_openvino_genai +__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'GenerationConfig', 'GenerationResult', 'Generator', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPipeline', 'draft_model', 'openvino', 'os', 'py_openvino_genai'] +__version__: str = '2024.5.0.0' diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi new file mode 100644 index 0000000000..8fab02bc47 --- /dev/null +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -0,0 +1,1684 @@ +""" +Pybind11 binding for Whisper Pipeline +""" +from __future__ import annotations +import openvino._pyopenvino +import os +import typing +__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'ImageGenerationConfig', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPipeline', 'draft_model'] +class Adapter: + """ + Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier. + """ + def __bool__(self) -> bool: + ... + @typing.overload + def __init__(self) -> None: + ... + @typing.overload + def __init__(self, path: os.PathLike) -> None: + """ + Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier. + path (os.PathLike): Path to adapter file in safetensors format. + """ +class AdapterConfig: + """ + Adapter config that defines a combination of LoRA adapters with blending parameters. + """ + class Mode: + """ + Members: + + MODE_AUTO + + MODE_DYNAMIC + + MODE_STATIC_RANK + + MODE_STATIC + + MODE_FUSE + """ + MODE_AUTO: typing.ClassVar[AdapterConfig.Mode] # value = + MODE_DYNAMIC: typing.ClassVar[AdapterConfig.Mode] # value = + MODE_FUSE: typing.ClassVar[AdapterConfig.Mode] # value = + MODE_STATIC: typing.ClassVar[AdapterConfig.Mode] # value = + MODE_STATIC_RANK: typing.ClassVar[AdapterConfig.Mode] # value = + __members__: typing.ClassVar[dict[str, AdapterConfig.Mode]] # value = {'MODE_AUTO': , 'MODE_DYNAMIC': , 'MODE_STATIC_RANK': , 'MODE_STATIC': , 'MODE_FUSE': } + def __eq__(self, other: typing.Any) -> bool: + ... + def __getstate__(self) -> int: + ... + def __hash__(self) -> int: + ... + def __index__(self) -> int: + ... + def __init__(self, value: int) -> None: + ... + def __int__(self) -> int: + ... + def __ne__(self, other: typing.Any) -> bool: + ... + def __repr__(self) -> str: + ... + def __setstate__(self, state: int) -> None: + ... + def __str__(self) -> str: + ... + @property + def name(self) -> str: + ... + @property + def value(self) -> int: + ... + def __bool__(self) -> bool: + ... 
+ @typing.overload + def __init__(self, mode: AdapterConfig.Mode = ...) -> None: + ... + @typing.overload + def __init__(self, adapter: Adapter, alpha: float, mode: AdapterConfig.Mode = ...) -> None: + ... + @typing.overload + def __init__(self, adapter: Adapter, mode: AdapterConfig.Mode = ...) -> None: + ... + @typing.overload + def __init__(self, adapters: list[Adapter], mode: AdapterConfig.Mode = ...) -> None: + ... + @typing.overload + def __init__(self, adapters: list[tuple[Adapter, float]], mode: AdapterConfig.Mode = ...) -> None: + ... + @typing.overload + def add(self, adapter: Adapter, alpha: float) -> AdapterConfig: + ... + @typing.overload + def add(self, adapter: Adapter) -> AdapterConfig: + ... + def get_adapters(self) -> list[Adapter]: + ... + def get_alpha(self, adapter: Adapter) -> float: + ... + def remove(self, adapter: Adapter) -> AdapterConfig: + ... + def set_alpha(self, adapter: Adapter, alpha: float) -> AdapterConfig: + ... +class AggregationMode: + """ + Represents the mode of per-token score aggregation when determining least important tokens for eviction from cache + :param AggregationMode.SUM: In this mode the importance scores of each token will be summed after each step of generation + :param AggregationMode.NORM_SUM: Same as SUM, but the importance scores are additionally divided by the lifetime (in tokens generated) of a given token in cache + + Members: + + SUM + + NORM_SUM + """ + NORM_SUM: typing.ClassVar[AggregationMode] # value = + SUM: typing.ClassVar[AggregationMode] # value = + __members__: typing.ClassVar[dict[str, AggregationMode]] # value = {'SUM': , 'NORM_SUM': } + def __eq__(self, other: typing.Any) -> bool: + ... + def __getstate__(self) -> int: + ... + def __hash__(self) -> int: + ... + def __index__(self) -> int: + ... + def __init__(self, value: int) -> None: + ... + def __int__(self) -> int: + ... + def __ne__(self, other: typing.Any) -> bool: + ... + def __repr__(self) -> str: + ... + def __setstate__(self, state: int) -> None: + ... + def __str__(self) -> str: + ... + @property + def name(self) -> str: + ... + @property + def value(self) -> int: + ... +class AutoencoderKL: + """ + AutoencoderKL class. + """ + class Config: + """ + This class is used for storing AutoencoderKL config. + """ + block_out_channels: list[int] + in_channels: int + latent_channels: int + out_channels: int + scaling_factor: float + def __init__(self, config_path: os.PathLike) -> None: + ... + @typing.overload + def __init__(self, vae_decoder_path: os.PathLike) -> None: + """ + AutoencoderKL class initialized only with decoder model. + vae_decoder_path (os.PathLike): VAE decoder directory. + """ + @typing.overload + def __init__(self, vae_encoder_path: os.PathLike, vae_decoder_path: os.PathLike) -> None: + """ + AutoencoderKL class initialized with both encoder and decoder models. + vae_encoder_path (os.PathLike): VAE encoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. + """ + @typing.overload + def __init__(self, vae_decoder_path: os.PathLike, device: str, **kwargs) -> None: + """ + AutoencoderKL class initialized only with decoder model. + vae_decoder_path (os.PathLike): VAE decoder directory. + device (str): Device on which inference will be done. + kwargs: Device properties. + """ + @typing.overload + def __init__(self, vae_encoder_path: os.PathLike, vae_decoder_path: os.PathLike, device: str, **kwargs) -> None: + """ + AutoencoderKL class initialized only with both encoder and decoder models. 
+ vae_encoder_path (os.PathLike): VAE encoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. + device (str): Device on which inference will be done. + kwargs: Device properties. + """ + @typing.overload + def __init__(self, model: AutoencoderKL) -> None: + """ + AutoencoderKL model + AutoencoderKL class. + model (AutoencoderKL): AutoencoderKL model. + """ + def compile(self, device: str, **kwargs) -> None: + """ + device on which inference will be done + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + """ + def decode(self, latent: openvino._pyopenvino.Tensor) -> openvino._pyopenvino.Tensor: + ... + def encode(self, image: openvino._pyopenvino.Tensor) -> openvino._pyopenvino.Tensor: + ... + def get_config(self) -> AutoencoderKL.Config: + ... + def get_vae_scale_factor(self) -> int: + ... + def reshape(self, batch_size: int, height: int, width: int) -> AutoencoderKL: + ... +class CLIPTextModel: + """ + CLIPTextModel class. + """ + class Config: + """ + This class is used for storing CLIPTextModel config. + """ + max_position_embeddings: int + num_hidden_layers: int + def __init__(self, config_path: str) -> None: + ... + @typing.overload + def __init__(self, root_dir: os.PathLike) -> None: + """ + CLIPTextModel class + root_dir (os.PathLike): Model root directory. + """ + @typing.overload + def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None: + """ + CLIPTextModel class + root_dir (os.PathLike): Model root directory. + device (str): Device on which inference will be done. + kwargs: Device properties. + """ + @typing.overload + def __init__(self, model: CLIPTextModel) -> None: + """ + CLIPText model + CLIPTextModel class + model (CLIPTextModel): CLIPText model + """ + def compile(self, device: str, **kwargs) -> None: + """ + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + """ + def get_config(self) -> CLIPTextModel.Config: + ... + def get_output_tensor(self, idx: int) -> openvino._pyopenvino.Tensor: + ... + def infer(self, pos_prompt: str, neg_prompt: str, do_classifier_free_guidance: bool) -> openvino._pyopenvino.Tensor: + ... + def reshape(self, batch_size: int) -> CLIPTextModel: + ... + def set_adapters(self, adapters: AdapterConfig | None) -> None: + ... +class CLIPTextModelWithProjection: + """ + CLIPTextModelWithProjection class. + """ + class Config: + """ + This class is used for storing CLIPTextModelWithProjection config. + """ + max_position_embeddings: int + num_hidden_layers: int + def __init__(self, config_path: os.PathLike) -> None: + ... + @typing.overload + def __init__(self, root_dir: os.PathLike) -> None: + """ + CLIPTextModelWithProjection class + root_dir (os.PathLike): Model root directory. + """ + @typing.overload + def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None: + """ + CLIPTextModelWithProjection class + root_dir (os.PathLike): Model root directory. + device (str): Device on which inference will be done. + kwargs: Device properties. + """ + @typing.overload + def __init__(self, model: CLIPTextModelWithProjection) -> None: + """ + CLIPTextModelWithProjection model + CLIPTextModelWithProjection class + model (CLIPTextModelWithProjection): CLIPTextModelWithProjection model + """ + def compile(self, device: str, **kwargs) -> None: + """ + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. 
+ """ + def get_config(self) -> CLIPTextModelWithProjection.Config: + ... + def get_output_tensor(self, idx: int) -> openvino._pyopenvino.Tensor: + ... + def infer(self, pos_prompt: str, neg_prompt: str, do_classifier_free_guidance: bool) -> openvino._pyopenvino.Tensor: + ... + def reshape(self, batch_size: int) -> CLIPTextModelWithProjection: + ... + def set_adapters(self, adapters: AdapterConfig | None) -> None: + ... +class CacheEvictionConfig: + """ + + Configuration struct for the cache eviction algorithm. + :param start_size: Number of tokens in the *beginning* of KV cache that should be retained in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for this pipeline. + :type start_size: int + + :param recent_size: Number of tokens in the *end* of KV cache that should be retained in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for this pipeline. + :type recent_size: int + + :param max_cache_size: Maximum number of tokens that should be kept in the KV cache. The evictable block area will be located between the "start" and "recent" blocks and its size will be calculated as (`max_cache_size` - `start_size` - `recent_size`). Must be non-zero, larger than (`start_size` + `recent_size`), and a multiple of the KV cache block size for this pipeline. Note that since only the completely filled blocks are evicted, the actual maximum per-sequence KV cache size in tokens may be up to (`max_cache_size` + `SchedulerConfig.block_size - 1`). + :type max_cache_size: int + + :param aggregation_mode: The mode used to compute the importance of tokens for eviction + :type aggregation_mode: openvino_genai.AggregationMode + """ + aggregation_mode: AggregationMode + def __init__(self, start_size: int, recent_size: int, max_cache_size: int, aggregation_mode: AggregationMode) -> None: + ... + def get_evictable_size(self) -> int: + ... + def get_max_cache_size(self) -> int: + ... + def get_recent_size(self) -> int: + ... + def get_start_size(self) -> int: + ... +class ContinuousBatchingPipeline: + """ + This class is used for generation with LLMs with continuous batchig + """ + @typing.overload + def __init__(self, models_path: str, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None: + ... + @typing.overload + def __init__(self, models_path: str, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None: + ... + @typing.overload + def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, sampling_params: GenerationConfig) -> GenerationHandle: + ... + @typing.overload + def add_request(self, request_id: int, prompt: str, sampling_params: GenerationConfig) -> GenerationHandle: + ... + @typing.overload + def generate(self, input_ids: list[openvino._pyopenvino.Tensor], sampling_params: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[EncodedGenerationResult]: + ... + @typing.overload + def generate(self, prompts: list[str], sampling_params: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[GenerationResult]: + ... + def get_config(self) -> GenerationConfig: + ... + def get_metrics(self) -> PipelineMetrics: + ... + def get_tokenizer(self) -> Tokenizer: + ... + def has_non_finished_requests(self) -> bool: + ... 
+ def step(self) -> None: + ... +class CppStdGenerator(Generator): + """ + This class wraps std::mt19937 pseudo-random generator. + """ + def __init__(self, seed: int) -> None: + ... + def next(self) -> float: + ... + def randn_tensor(self, shape: openvino._pyopenvino.Shape) -> openvino._pyopenvino.Tensor: + ... +class DecodedResults: + """ + + Structure to store resulting batched text outputs and scores for each batch. + The first num_return_sequences elements correspond to the first batch element. + + Parameters: + texts: vector of resulting sequences. + scores: scores for each sequence. + metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. + """ + def __init__(self) -> None: + ... + def __str__(self) -> str: + ... + @property + def perf_metrics(self) -> PerfMetrics: + ... + @property + def scores(self) -> list[float]: + ... + @property + def texts(self) -> list[str]: + ... +class EncodedGenerationResult: + """ + + GenerationResult stores resulting batched tokens and scores. + + Parameters: + request_id: obsolete when handle API is approved as handle will connect results with prompts. + generation_ids: in a generic case we have multiple generation results per initial prompt + depending on sampling parameters (e.g. beam search or parallel sampling). + scores: scores. + status: status of generation. The following values are possible: + RUNNING = 0 - Default status for ongoing generation. + FINISHED = 1 - Status set when generation has been finished. + IGNORED = 2 - Status set when generation run into out-of-memory condition and could not be continued. + DROPPED_BY_PIPELINE = 3 - Currently not used, TODO: implement abort functionality. + DROPPED_BY_HANDLE = 4 - Status set when generation handle is dropped. + + """ + m_generation_ids: list[list[int]] + m_scores: list[float] + def __init__(self) -> None: + ... + @property + def m_request_id(self) -> int: + ... +class EncodedResults: + """ + + Structure to store resulting batched tokens and scores for each batch sequence. + The first num_return_sequences elements correspond to the first batch element. + In the case if results decoded with beam search and random sampling scores contain + sum of logarithmic probabilities for each token in the sequence. In the case + of greedy decoding scores are filled with zeros. + + Parameters: + tokens: sequence of resulting tokens. + scores: sum of logarithmic probabilities of all tokens in the sequence. + metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. + """ + @property + def perf_metrics(self) -> PerfMetrics: + ... + @property + def scores(self) -> list[float]: + ... + @property + def tokens(self) -> list[list[int]]: + ... +class GenerationConfig: + """ + + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + be used while greedy and beam search parameters will not affect decoding at all. + + Parameters: + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. + max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + ignore_eos: if set to true, then generation will not stop even if token is met. 
+ eos_token_id: token_id of (end of sentence)
+ min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching.
+ stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
+ include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
+ stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
+ echo: if set to true, the model will echo the prompt in the output.
+ logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
+ Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
+
+ Beam search specific parameters:
+ num_beams: number of beams for beam search. 1 disables beam search.
+ num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+ diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time.
+ length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+ the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+ likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while
+ length_penalty < 0.0 encourages shorter sequences.
+ num_return_sequences: the number of sequences to return for grouped beam search decoding.
+ no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
+ stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values:
+ "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
+ "openvino_genai.StopCriteria.HEURISTIC", where the heuristic is applied and the generation stops when it is very unlikely to find better candidates;
+ "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
+
+ Random sampling parameters:
+ temperature: the value used to modulate token probabilities for random sampling.
+ top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+ top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering.
+ do_sample: whether or not to use multinomial random sampling for token selection.
+ repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+ """ + adapters: AdapterConfig | None + assistant_confidence_threshold: float + diversity_penalty: float + do_sample: bool + echo: bool + eos_token_id: int + frequency_penalty: float + ignore_eos: bool + include_stop_str_in_output: bool + length_penalty: float + logprobs: int + max_length: int + max_new_tokens: int + min_new_tokens: int + no_repeat_ngram_size: int + num_assistant_tokens: int + num_beam_groups: int + num_beams: int + num_return_sequences: int + presence_penalty: float + repetition_penalty: float + rng_seed: int + stop_criteria: StopCriteria + stop_strings: set[str] + stop_token_ids: set[int] + temperature: float + top_k: int + top_p: float + @typing.overload + def __init__(self, json_path: os.PathLike) -> None: + """ + path where generation_config.json is stored + """ + @typing.overload + def __init__(self, **kwargs) -> None: + ... + def is_beam_search(self) -> bool: + ... + def is_greedy_decoding(self) -> bool: + ... + def is_speculative_decoding(self) -> bool: + ... + def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None: + ... + def update_generation_config(self, config_map: dict[str, openvino._pyopenvino.OVAny]) -> None: + ... +class GenerationFinishReason: + """ + Members: + + NONE + + STOP + + LENGTH + """ + LENGTH: typing.ClassVar[GenerationFinishReason] # value = + NONE: typing.ClassVar[GenerationFinishReason] # value = + STOP: typing.ClassVar[GenerationFinishReason] # value = + __members__: typing.ClassVar[dict[str, GenerationFinishReason]] # value = {'NONE': , 'STOP': , 'LENGTH': } + def __eq__(self, other: typing.Any) -> bool: + ... + def __getstate__(self) -> int: + ... + def __hash__(self) -> int: + ... + def __index__(self) -> int: + ... + def __init__(self, value: int) -> None: + ... + def __int__(self) -> int: + ... + def __ne__(self, other: typing.Any) -> bool: + ... + def __repr__(self) -> str: + ... + def __setstate__(self, state: int) -> None: + ... + def __str__(self) -> str: + ... + @property + def name(self) -> str: + ... + @property + def value(self) -> int: + ... +class GenerationHandle: + def back(self) -> dict[int, GenerationOutput]: + ... + def can_read(self) -> bool: + ... + def drop(self) -> None: + ... + def get_status(self) -> GenerationStatus: + ... + def read(self) -> dict[int, GenerationOutput]: + ... + def read_all(self) -> list[GenerationOutput]: + ... +class GenerationOutput: + finish_reason: GenerationFinishReason + generated_ids: list[int] + generated_log_probs: list[float] + score: float +class GenerationResult: + """ + + GenerationResult stores resulting batched tokens and scores. + + Parameters: + request_id: obsolete when handle API is approved as handle will connect results with prompts. + generation_ids: in a generic case we have multiple generation results per initial prompt + depending on sampling parameters (e.g. beam search or parallel sampling). + scores: scores. + status: status of generation. The following values are possible: + RUNNING = 0 - Default status for ongoing generation. + FINISHED = 1 - Status set when generation has been finished. + IGNORED = 2 - Status set when generation run into out-of-memory condition and could not be continued. + DROPPED_BY_PIPELINE = 3 - Currently not used, TODO: implement abort functionality. + DROPPED_BY_HANDLE = 4 - Status set when generation handle is dropped. + + """ + m_generation_ids: list[str] + m_scores: list[float] + def __init__(self) -> None: + ... + def __repr__(self) -> str: + ... + def get_generation_ids(self) -> list[str]: + ... 
+ @property + def m_request_id(self) -> int: + ... +class GenerationStatus: + """ + Members: + + RUNNING + + FINISHED + + IGNORED + + DROPPED_BY_PIPELINE + + DROPPED_BY_HANDLE + """ + DROPPED_BY_HANDLE: typing.ClassVar[GenerationStatus] # value = + DROPPED_BY_PIPELINE: typing.ClassVar[GenerationStatus] # value = + FINISHED: typing.ClassVar[GenerationStatus] # value = + IGNORED: typing.ClassVar[GenerationStatus] # value = + RUNNING: typing.ClassVar[GenerationStatus] # value = + __members__: typing.ClassVar[dict[str, GenerationStatus]] # value = {'RUNNING': , 'FINISHED': , 'IGNORED': , 'DROPPED_BY_PIPELINE': , 'DROPPED_BY_HANDLE': } + def __eq__(self, other: typing.Any) -> bool: + ... + def __getstate__(self) -> int: + ... + def __hash__(self) -> int: + ... + def __index__(self) -> int: + ... + def __init__(self, value: int) -> None: + ... + def __int__(self) -> int: + ... + def __ne__(self, other: typing.Any) -> bool: + ... + def __repr__(self) -> str: + ... + def __setstate__(self, state: int) -> None: + ... + def __str__(self) -> str: + ... + @property + def name(self) -> str: + ... + @property + def value(self) -> int: + ... +class Generator: + """ + This class is used for storing pseudo-random generator. + """ + def __init__(self) -> None: + ... +class ImageGenerationConfig: + """ + This class is used for storing generation config for image generation pipeline. + """ + adapters: AdapterConfig | None + generator: Generator + guidance_scale: float + height: int + negative_prompt: str | None + negative_prompt_2: str | None + negative_prompt_3: str | None + num_images_per_prompt: int + num_inference_steps: int + prompt_2: str | None + prompt_3: str | None + strength: float + width: int + def __init__(self) -> None: + ... + def update_generation_config(self, **kwargs) -> None: + ... + def validate(self) -> None: + ... +class LLMPipeline: + """ + This class is used for generation with LLMs + """ + def __call__(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults: + """ + Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized. + + :param inputs: inputs in the form of string, list of strings or tokenized input_ids + :type inputs: str, List[str], ov.genai.TokenizedInputs, or ov.Tensor + + :param generation_config: generation_config + :type generation_config: GenerationConfig or a Dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped + :type : Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. + :type : Dict + + :return: return results in encoded, or decoded form depending on inputs type + :rtype: DecodedResults, EncodedResults, str + + + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + be used while greedy and beam search parameters will not affect decoding at all. + + Parameters: + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. 
+ max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+ ignore_eos: if set to true, then generation will not stop even if token is met.
+ eos_token_id: token_id of (end of sentence)
+ min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching.
+ stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
+ include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
+ stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
+ echo: if set to true, the model will echo the prompt in the output.
+ logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
+ Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
+
+ Beam search specific parameters:
+ num_beams: number of beams for beam search. 1 disables beam search.
+ num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+ diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time.
+ length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+ the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+ likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while
+ length_penalty < 0.0 encourages shorter sequences.
+ num_return_sequences: the number of sequences to return for grouped beam search decoding.
+ no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
+ stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values:
+ "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
+ "openvino_genai.StopCriteria.HEURISTIC", where the heuristic is applied and the generation stops when it is very unlikely to find better candidates;
+ "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
+
+ Random sampling parameters:
+ temperature: the value used to modulate token probabilities for random sampling.
+ top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+ top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering.
+ do_sample: whether or not to use multinomial random sampling for token selection.
+ repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+ """
+ @typing.overload
+ def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None:
+ """
+ LLMPipeline class constructor for manually created openvino_genai.Tokenizer.
+ models_path (os.PathLike): Path to the model file.
+ tokenizer (openvino_genai.Tokenizer): tokenizer object.
+ device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
+ Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. + kwargs: Device properties. + """ + @typing.overload + def __init__(self, models_path: os.PathLike, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None: + """ + LLMPipeline class constructor. + models_path (os.PathLike): Path to the model file. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. + Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. + kwargs: Device properties. + """ + def finish_chat(self) -> None: + ... + def generate(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults: + """ + Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized. + + :param inputs: inputs in the form of string, list of strings or tokenized input_ids + :type inputs: str, List[str], ov.genai.TokenizedInputs, or ov.Tensor + + :param generation_config: generation_config + :type generation_config: GenerationConfig or a Dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped + :type : Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. + :type : Dict + + :return: return results in encoded, or decoded form depending on inputs type + :rtype: DecodedResults, EncodedResults, str + + + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + be used while greedy and beam search parameters will not affect decoding at all. + + Parameters: + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. + max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + ignore_eos: if set to true, then generation will not stop even if token is met. + eos_token_id: token_id of (end of sentence) + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching. + stop_strings: a set of strings that will cause pipeline to stop generating further tokens. + include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) + stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. + echo: if set to true, the model will echo the prompt in the output. + logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. + Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + + Beam search specific parameters: + num_beams: number of beams for beam search. 1 disables beam search. + num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. 
+ diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time.
+ length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+ the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+ likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while
+ length_penalty < 0.0 encourages shorter sequences.
+ num_return_sequences: the number of sequences to return for grouped beam search decoding.
+ no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
+ stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values:
+ "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
+ "openvino_genai.StopCriteria.HEURISTIC", where the heuristic is applied and the generation stops when it is very unlikely to find better candidates;
+ "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
+
+ Random sampling parameters:
+ temperature: the value used to modulate token probabilities for random sampling.
+ top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+ top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering.
+ do_sample: whether or not to use multinomial random sampling for token selection.
+ repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+ """
+ def get_generation_config(self) -> GenerationConfig:
+ ...
+ def get_tokenizer(self) -> Tokenizer:
+ ...
+ def set_generation_config(self, config: GenerationConfig) -> None:
+ ...
+ def start_chat(self, system_message: str = '') -> None:
+ ...
+class MeanStdPair:
+ def __init__(self) -> None:
+ ...
+ def __iter__(self) -> typing.Iterator[float]:
+ ...
+ @property
+ def mean(self) -> float:
+ ...
+ @property
+ def std(self) -> float:
+ ...
+class PerfMetrics:
+ """
+
+ Holds performance metrics for each generate call.
+
+ PerfMetrics holds fields with mean and standard deviations for the following metrics:
+ - Time To the First Token (TTFT), ms
+ - Time per Output Token (TPOT), ms/token
+ - Generate total duration, ms
+ - Tokenization duration, ms
+ - Detokenization duration, ms
+ - Throughput, tokens/s
+
+ Additional fields include:
+ - Load time, ms
+ - Number of generated tokens
+ - Number of tokens in the input prompt
+
+ Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs.
+ If mean and std were already calculated, getters return cached values.
+
+ :param get_load_time: Returns the load time in milliseconds.
+ :type get_load_time: float
+
+ :param get_num_generated_tokens: Returns the number of generated tokens.
+ :type get_num_generated_tokens: int
+
+ :param get_num_input_tokens: Returns the number of tokens in the input prompt.
+ :type get_num_input_tokens: int
+
+ :param get_ttft: Returns the mean and standard deviation of TTFT in milliseconds.
+ :type get_ttft: MeanStdPair
+
+ :param get_tpot: Returns the mean and standard deviation of TPOT in milliseconds.
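A short sketch of LLMPipeline.generate together with the performance metrics described here; the model directory is a placeholder and the printed values are whatever the run produces:

```python
import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline("./llm_model_dir", "CPU")
result = pipe.generate(["What is OpenVINO?"], max_new_tokens=64)
print(result.texts[0])

metrics = result.perf_metrics
print(f"load time: {metrics.get_load_time():.1f} ms")
print(f"TTFT: {metrics.get_ttft().mean:.1f} +/- {metrics.get_ttft().std:.1f} ms")
print(f"TPOT: {metrics.get_tpot().mean:.1f} ms/token")
print(f"throughput: {metrics.get_throughput().mean:.1f} tokens/s")
```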
+ :type get_tpot: MeanStdPair + + :param get_throughput: Returns the mean and standard deviation of throughput in tokens per second. + :type get_throughput: MeanStdPair + + :param get_generate_duration: Returns the mean and standard deviation of generate durations in milliseconds. + :type get_generate_duration: MeanStdPair + + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization durations in milliseconds. + :type get_tokenization_duration: MeanStdPair + + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds. + :type get_detokenization_duration: MeanStdPair + + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. + :type raw_metrics: RawPerfMetrics + """ + def __add__(self, metrics: PerfMetrics) -> PerfMetrics: + ... + def __iadd__(self, right: PerfMetrics) -> PerfMetrics: + ... + def __init__(self) -> None: + ... + def get_detokenization_duration(self) -> MeanStdPair: + ... + def get_generate_duration(self) -> MeanStdPair: + ... + def get_inference_duration(self) -> MeanStdPair: + ... + def get_ipot(self) -> MeanStdPair: + ... + def get_load_time(self) -> float: + ... + def get_num_generated_tokens(self) -> int: + ... + def get_num_input_tokens(self) -> int: + ... + def get_throughput(self) -> MeanStdPair: + ... + def get_tokenization_duration(self) -> MeanStdPair: + ... + def get_tpot(self) -> MeanStdPair: + ... + def get_ttft(self) -> MeanStdPair: + ... + @property + def raw_metrics(self) -> RawPerfMetrics: + ... +class PipelineMetrics: + """ + + Contains general pipeline metrics, either aggregated throughout the lifetime of the generation pipeline + or measured at the previous generation step. + + :param requests: Number of requests to be processed by the pipeline. + :type requests: int + + :param scheduled_requests: Number of requests that were scheduled for processing at the previous step of the pipeline. + :type scheduled_requests: int + + :param cache_usage: Percentage of KV cache usage in the last generation step. + :type cache_usage: float + + :param max_cache_usage: Max KV cache usage during the lifetime of the pipeline in % + :type max_cache_usage: float + + + :param avg_cache_usage: Running average of the KV cache usage (in %) during the lifetime of the pipeline, with max window size of 1000 steps + :type avg_cache_usage: float + """ + def __init__(self) -> None: + ... + @property + def avg_cache_usage(self) -> float: + ... + @property + def cache_usage(self) -> float: + ... + @property + def max_cache_usage(self) -> float: + ... + @property + def requests(self) -> int: + ... + @property + def scheduled_requests(self) -> int: + ... +class RawPerfMetrics: + """ + + Structure with raw performance metrics for each generation before any statistics are calculated. + + :param generate_durations: Durations for each generate call in microseconds. + :type generate_durations: List[MicroSeconds] + + :param tokenization_durations: Durations for the tokenization process in microseconds. + :type tokenization_durations: List[MicroSeconds] + + :param detokenization_durations: Durations for the detokenization process in microseconds. + :type detokenization_durations: List[MicroSeconds] + + :param m_times_to_first_token: Times to the first token for each call in microseconds. + :type m_times_to_first_token: List[MicroSeconds] + + :param m_new_token_times: Timestamps of generation every token or batch of tokens in milliseconds. 
+ :type m_new_token_times: List[MilliSeconds] + + :param m_batch_sizes: Batch sizes for each generate call. + :type m_batch_sizes: List[int] + + :param m_durations: Total durations for each generate call in microseconds. + :type m_durations: List[MicroSeconds] + + :param num_generated_tokens: Total number of tokens generated. + :type num_generated_tokens: int + + :param num_input_tokens: Total number of tokens in the input prompt. + :type num_input_tokens: int + """ + def __init__(self) -> None: + ... + @property + def detokenization_durations(self) -> list[float]: + ... + @property + def generate_durations(self) -> list[float]: + ... + @property + def m_batch_sizes(self) -> list[int]: + ... + @property + def m_durations(self) -> list[float]: + ... + @property + def m_new_token_times(self) -> list[float]: + ... + @property + def m_times_to_first_token(self) -> list[float]: + ... + @property + def tokenization_durations(self) -> list[float]: + ... +class Scheduler: + """ + Scheduler for image generation pipelines. + """ + class Type: + """ + Members: + + AUTO + + LCM + + LMS_DISCRETE + + DDIM + + EULER_DISCRETE + + FLOW_MATCH_EULER_DISCRETE + """ + AUTO: typing.ClassVar[Scheduler.Type] # value = + DDIM: typing.ClassVar[Scheduler.Type] # value = + EULER_DISCRETE: typing.ClassVar[Scheduler.Type] # value = + FLOW_MATCH_EULER_DISCRETE: typing.ClassVar[Scheduler.Type] # value = + LCM: typing.ClassVar[Scheduler.Type] # value = + LMS_DISCRETE: typing.ClassVar[Scheduler.Type] # value = + __members__: typing.ClassVar[dict[str, Scheduler.Type]] # value = {'AUTO': , 'LCM': , 'LMS_DISCRETE': , 'DDIM': , 'EULER_DISCRETE': , 'FLOW_MATCH_EULER_DISCRETE': } + def __eq__(self, other: typing.Any) -> bool: + ... + def __getstate__(self) -> int: + ... + def __hash__(self) -> int: + ... + def __index__(self) -> int: + ... + def __init__(self, value: int) -> None: + ... + def __int__(self) -> int: + ... + def __ne__(self, other: typing.Any) -> bool: + ... + def __repr__(self) -> str: + ... + def __setstate__(self, state: int) -> None: + ... + def __str__(self) -> str: + ... + @property + def name(self) -> str: + ... + @property + def value(self) -> int: + ... + @staticmethod + def from_config(scheduler_config_path: os.PathLike, scheduler_type: Scheduler.Type = ...) -> Scheduler: + ... +class SchedulerConfig: + """ + + SchedulerConfig to construct ContinuousBatchingPipeline + + Parameters: + max_num_batched_tokens: a maximum number of tokens to batch (in contrast to max_batch_size which combines + independent sequences, we consider total amount of tokens in a batch). + num_kv_blocks: total number of KV blocks available to scheduler logic. + cache_size: total size of KV cache in GB. + block_size: block size for KV cache. + dynamic_split_fuse: whether to split prompt / generate to different scheduling phases. + + vLLM-like settings: + max_num_seqs: max number of scheduled sequences (you can think of it as "max batch size"). + enable_prefix_caching: Enable caching of KV-blocks. + When turned on all previously calculated KV-caches are kept in memory for future usages. + KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. + This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. + When turend off only KV-cache required for batch calculation is kept in memory and + when a sequence has finished genegartion its cache is released. 
+ """ + cache_eviction_config: CacheEvictionConfig + cache_size: int + dynamic_split_fuse: bool + enable_prefix_caching: bool + max_num_batched_tokens: int + max_num_seqs: int + num_kv_blocks: int + use_cache_eviction: bool + def __init__(self) -> None: + ... +class StopCriteria: + """ + + StopCriteria controls the stopping condition for grouped beam search. + + The following values are possible: + "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates. + "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates. + "openvino_genai.StopCriteria.NEVER" stops when there cannot be better candidates. + + + Members: + + EARLY + + HEURISTIC + + NEVER + """ + EARLY: typing.ClassVar[StopCriteria] # value = + HEURISTIC: typing.ClassVar[StopCriteria] # value = + NEVER: typing.ClassVar[StopCriteria] # value = + __members__: typing.ClassVar[dict[str, StopCriteria]] # value = {'EARLY': , 'HEURISTIC': , 'NEVER': } + def __eq__(self, other: typing.Any) -> bool: + ... + def __getstate__(self) -> int: + ... + def __hash__(self) -> int: + ... + def __index__(self) -> int: + ... + def __init__(self, value: int) -> None: + ... + def __int__(self) -> int: + ... + def __ne__(self, other: typing.Any) -> bool: + ... + def __repr__(self) -> str: + ... + def __setstate__(self, state: int) -> None: + ... + def __str__(self) -> str: + ... + @property + def name(self) -> str: + ... + @property + def value(self) -> int: + ... +class StreamerBase: + """ + + Base class for streamers. In order to use inherit from from this class and implement put, and methods. + """ + def __init__(self) -> None: + ... + def end(self) -> None: + """ + End is called at the end of generation. It can be used to flush cache if your own streamer has one + """ + def put(self, token: int) -> bool: + """ + Put is called every time new token is decoded. Returns a bool flag to indicate whether generation should be stopped, if return true generation stops + """ +class Text2ImagePipeline: + """ + This class is used for generation with text-to-image models. + """ + @staticmethod + def latent_consistency_model(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Text2ImagePipeline: + ... + @staticmethod + def stable_diffusion(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Text2ImagePipeline: + ... + @staticmethod + def stable_diffusion_xl(scheduler: Scheduler, clip_text_model: CLIPTextModel, clip_text_model_with_projection: CLIPTextModelWithProjection, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Text2ImagePipeline: + ... + @typing.overload + def __init__(self, models_path: os.PathLike) -> None: + """ + Text2ImagePipeline class constructor. + models_path (os.PathLike): Path to the folder with exported model files. + """ + @typing.overload + def __init__(self, models_path: os.PathLike, device: str, **kwargs) -> None: + """ + Text2ImagePipeline class constructor. + models_path (os.PathLike): Path with exported model files. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Text2ImagePipeline properties + """ + def compile(self, device: str, **kwargs) -> None: + """ + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + """ + def generate(self, prompt: str, **kwargs) -> openvino._pyopenvino.Tensor: + """ + Generates images for text-to-image models. 
+ + :param prompt: input prompt + :type prompt: str + + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. + + Expected parameters list: + prompt_2: str - second prompt, + prompt_3: str - third prompt, + negative_prompt: str - negative prompt, + negative_prompt_2: str - second negative prompt, + negative_prompt_3: str - third negative prompt, + num_images_per_prompt: int - number of images, that should be generated per prompt, + guidance_scale: float - guidance scale, + generation_config: GenerationConfig, + height: int - height of resulting images, + width: int - width of resulting images, + num_inference_steps: int - number of inference steps, + generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator + adapters: LoRA adapters + strength: strength for image to image generation. 1.0f means initial image is fully noised + + :return: ov.Tensor with resulting images + :rtype: ov.Tensor + """ + def get_generation_config(self) -> ImageGenerationConfig: + ... + def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None: + ... + def set_generation_config(self, generation_config: ImageGenerationConfig) -> None: + ... + def set_scheduler(self, scheduler: Scheduler) -> None: + ... +class TokenizedInputs: + attention_mask: openvino._pyopenvino.Tensor + input_ids: openvino._pyopenvino.Tensor + def __init__(self, input_ids: openvino._pyopenvino.Tensor, attention_mask: openvino._pyopenvino.Tensor) -> None: + ... +class Tokenizer: + """ + openvino_genai.Tokenizer object is used to initialize Tokenizer + if it's located in a different path than the main model. + """ + def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}) -> None: + ... + def apply_chat_template(self, history: list[dict[str, str]], add_generation_prompt: bool, chat_template: str = '') -> str: + """ + Embeds input prompts with special tags for a chat scenario. + """ + @typing.overload + def decode(self, tokens: list[int]) -> str: + """ + Decode a sequence into a string prompt. + """ + @typing.overload + def decode(self, tokens: openvino._pyopenvino.Tensor) -> list[str]: + """ + Decode tensor into a list of string prompts. + """ + @typing.overload + def decode(self, tokens: list[list[int]]) -> list[str]: + """ + Decode a batch of tokens into a list of string prompt. + """ + @typing.overload + def encode(self, prompts: list[str], add_special_tokens: bool = True) -> TokenizedInputs: + """ + Encodes a list of prompts into tokenized inputs. + """ + @typing.overload + def encode(self, prompt: str, add_special_tokens: bool = True) -> TokenizedInputs: + """ + Encodes a single prompt into tokenized input. + """ + def get_bos_token(self) -> str: + ... + def get_bos_token_id(self) -> int: + ... + def get_eos_token(self) -> str: + ... + def get_eos_token_id(self) -> int: + ... + def get_pad_token(self) -> str: + ... + def get_pad_token_id(self) -> int: + ... + def set_chat_template(self, chat_template: str) -> None: + """ + Override a chat_template read from tokenizer_config.json. + """ +class UNet2DConditionModel: + """ + UNet2DConditionModel class. + """ + class Config: + """ + This class is used for storing UNet2DConditionModel config. + """ + in_channels: int + sample_size: int + time_cond_proj_dim: int + def __init__(self, config_path: os.PathLike) -> None: + ... 
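A minimal text-to-image sketch using the generate parameters listed above. The model directory is a placeholder, and saving via Pillow assumes the returned tensor is laid out as [1, height, width, 3] uint8, as in the GenAI image samples:

```python
from PIL import Image
import openvino_genai as ov_genai

pipe = ov_genai.Text2ImagePipeline("./sd_model_dir", "CPU")
image_tensor = pipe.generate(
    "a photo of a red sports car",
    width=512, height=512,
    num_inference_steps=20,
    num_images_per_prompt=1,
    generator=ov_genai.CppStdGenerator(42))  # fixed seed for reproducibility
Image.fromarray(image_tensor.data[0]).save("car.png")
```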
+ @typing.overload + def __init__(self, root_dir: os.PathLike) -> None: + """ + UNet2DConditionModel class + root_dir (os.PathLike): Model root directory. + """ + @typing.overload + def __init__(self, root_dir: os.PathLike, device: str, **kwargs) -> None: + """ + UNet2DConditionModel class + root_dir (os.PathLike): Model root directory. + device (str): Device on which inference will be done. + kwargs: Device properties. + """ + @typing.overload + def __init__(self, model: UNet2DConditionModel) -> None: + """ + UNet2DConditionModel model + UNet2DConditionModel class + model (UNet2DConditionModel): UNet2DConditionModel model + """ + def compile(self, device: str, **kwargs) -> None: + """ + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + """ + def get_config(self) -> UNet2DConditionModel.Config: + ... + def infer(self, sample: openvino._pyopenvino.Tensor, timestep: openvino._pyopenvino.Tensor) -> openvino._pyopenvino.Tensor: + ... + def reshape(self, batch_size: int, height: int, width: int, tokenizer_model_max_length: int) -> UNet2DConditionModel: + ... + def set_adapters(self, adapters: AdapterConfig | None) -> None: + ... + def set_hidden_states(self, tensor_name: str, encoder_hidden_states: openvino._pyopenvino.Tensor) -> None: + ... +class VLMPipeline: + """ + This class is used for generation with VLMs + """ + def __init__(self, models_path: os.PathLike, device: str, **kwargs) -> None: + """ + device on which inference will be done + VLMPipeline class constructor. + models_path (os.PathLike): Path to the folder with exported model files. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. + kwargs: Device properties + """ + def finish_chat(self) -> None: + ... + @typing.overload + def generate(self, prompt: str, images: list[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> DecodedResults: + """ + Generates sequences for VLMs. + + :param prompt: input prompt + :type prompt: str + + :param images: image or list of images + :type images: List[ov.Tensor] or ov.Tensor + + :param generation_config: generation_config + :type generation_config: GenerationConfig or a Dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped + :type : Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. + :type : Dict + + :return: return results in decoded form + :rtype: DecodedResults + """ + @typing.overload + def generate(self, prompt: str, images: openvino._pyopenvino.Tensor, generation_config: GenerationConfig, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> DecodedResults: + """ + Generates sequences for VLMs. + + :param prompt: input prompt + :type prompt: str + + :param images: image or list of images + :type images: List[ov.Tensor] or ov.Tensor + + :param generation_config: generation_config + :type generation_config: GenerationConfig or a Dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped + :type : Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. 
+ :type : Dict + + :return: return results in decoded form + :rtype: DecodedResults + """ + @typing.overload + def generate(self, prompt: str, **kwargs) -> DecodedResults: + """ + Generates sequences for VLMs. + + :param prompt: input prompt + :type prompt: str + + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. + + Expected parameters list: + image: ov.Tensor - input image, + images: List[ov.Tensor] - input images, + generation_config: GenerationConfig, + streamer: Callable[[str], bool], ov.genai.StreamerBase - streamer either as a lambda with a boolean returning flag whether generation should be stopped + + :return: return results in decoded form + :rtype: DecodedResults + """ + def get_generation_config(self) -> GenerationConfig: + ... + def get_tokenizer(self) -> Tokenizer: + ... + def set_chat_template(self, new_template: str) -> None: + ... + def set_generation_config(self, new_config: GenerationConfig) -> None: + ... + def start_chat(self, system_message: str = '') -> None: + ... +class WhisperDecodedResultChunk: + """ + + Structure to store decoded text with corresponding timestamps + + :param start_ts chunk start time in seconds + :param end_ts chunk end time in seconds + :param text chunk text + """ + def __init__(self) -> None: + ... + @property + def end_ts(self) -> float: + ... + @property + def start_ts(self) -> float: + ... + @property + def text(self) -> str: + ... +class WhisperDecodedResults(DecodedResults): + """ + + Structure to store resulting batched text outputs and scores for each batch. + The first num_return_sequences elements correspond to the first batch element. + + Parameters: + texts: vector of resulting sequences. + scores: scores for each sequence. + metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. + shunks: chunk of resulting sequences with timestamps + """ + @property + def chunks(self) -> list[WhisperDecodedResultChunk] | None: + ... +class WhisperGenerationConfig: + """ + + WhisperGenerationConfig + :param max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. + :type max_length: int + + :param max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + :type max_new_tokens: int + + :param eos_token_id: End of stream token id. + :type eos_token_id: int + + Whisper specific parameters: + + :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. + :type decoder_start_token_id: int + + :param pad_token_id: Padding token id. + :type pad_token_id: int + + :param translate_token_id: Translate token id. + :type translate_token_id: int + + :param transcribe_token_id: Transcribe token id. + :type transcribe_token_id: int + + :param no_timestamps_token_id: No timestamps token id. + :type no_timestamps_token_id: int + + :param is_multilingual: + :type is_multilingual: bool + + :param begin_suppress_tokens: A list containing tokens that will be suppressed at the beginning of the sampling process. + :type begin_suppress_tokens: list[int] + + :param suppress_tokens: A list containing the non-speech tokens that will be suppressed during generation. + :type suppress_tokens: list[int] + + :param language: Language token to use for generation in the form of <|en|>. 
+ You can find all the possible language tokens in the generation_config.json lang_to_id dictionary. + :type language: Optional[str] + + :param lang_to_id: Language token to token_id map. Initialized from the generation_config.json lang_to_id dictionary. + :type lang_to_id: Dict[str, int] + + :param task: Task to use for generation, either “translate” or “transcribe” + :type task: int + + :param return_timestamps: If `true` the pipeline will return timestamps along the text for *segments* of words in the text. + For instance, if you get + WhisperDecodedResultChunk + start_ts = 0.5 + end_ts = 1.5 + text = " Hi there!" + then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. + Note that a segment of text refers to a sequence of one or more words, rather than individual words. + :type return_timestamps: bool + """ + begin_suppress_tokens: list[int] + decoder_start_token_id: int + eos_token_id: int + is_multilingual: bool + lang_to_id: dict[str, int] + language: str | None + max_initial_timestamp_index: int + max_length: int + max_new_tokens: int + no_timestamps_token_id: int + pad_token_id: int + return_timestamps: bool + suppress_tokens: list[int] + task: str | None + transcribe_token_id: int + translate_token_id: int + @typing.overload + def __init__(self, json_path: os.PathLike) -> None: + """ + path where generation_config.json is stored + """ + @typing.overload + def __init__(self, **kwargs) -> None: + ... + def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None: + ... +class WhisperPipeline: + """ + Automatic speech recognition pipeline + """ + def __init__(self, models_path: os.PathLike, device: str, **kwargs) -> None: + """ + WhisperPipeline class constructor. + models_path (os.PathLike): Path to the model file. + device (str): Device to run the model on (e.g., CPU, GPU). + """ + def generate(self, raw_speech_input: list[float], generation_config: WhisperGenerationConfig | None = None, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> DecodedResults: + """ + High level generate that receives raw speech as a vector of floats and returns decoded output. + + :param raw_speech_input: inputs in the form of list of floats. Required to be normalized to near [-1, 1] range and have 16k Hz sampling rate. + :type raw_speech_input: List[float] + + :param generation_config: generation_config + :type generation_config: WhisperGenerationConfig or a Dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped. + Streamer supported for short-form audio (< 30 seconds) with `return_timestamps=False` only + :type : Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to WhisperGenerationConfig fields. + :type : Dict + + :return: return results in encoded, or decoded form depending on inputs type + :rtype: DecodedResults + + + WhisperGenerationConfig + :param max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. + :type max_length: int + + :param max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + :type max_new_tokens: int + + :param eos_token_id: End of stream token id. 
+ :type eos_token_id: int + + Whisper specific parameters: + + :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. + :type decoder_start_token_id: int + + :param pad_token_id: Padding token id. + :type pad_token_id: int + + :param translate_token_id: Translate token id. + :type translate_token_id: int + + :param transcribe_token_id: Transcribe token id. + :type transcribe_token_id: int + + :param no_timestamps_token_id: No timestamps token id. + :type no_timestamps_token_id: int + + :param is_multilingual: + :type is_multilingual: bool + + :param begin_suppress_tokens: A list containing tokens that will be suppressed at the beginning of the sampling process. + :type begin_suppress_tokens: list[int] + + :param suppress_tokens: A list containing the non-speech tokens that will be suppressed during generation. + :type suppress_tokens: list[int] + + :param language: Language token to use for generation in the form of <|en|>. + You can find all the possible language tokens in the generation_config.json lang_to_id dictionary. + :type language: Optional[str] + + :param lang_to_id: Language token to token_id map. Initialized from the generation_config.json lang_to_id dictionary. + :type lang_to_id: Dict[str, int] + + :param task: Task to use for generation, either “translate” or “transcribe” + :type task: int + + :param return_timestamps: If `true` the pipeline will return timestamps along the text for *segments* of words in the text. + For instance, if you get + WhisperDecodedResultChunk + start_ts = 0.5 + end_ts = 1.5 + text = " Hi there!" + then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. + Note that a segment of text refers to a sequence of one or more words, rather than individual words. + :type return_timestamps: bool + """ + def get_generation_config(self) -> WhisperGenerationConfig: + ... + def get_tokenizer(self) -> Tokenizer: + ... + def set_generation_config(self, config: WhisperGenerationConfig) -> None: + ... 
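Putting the Whisper classes above together, a minimal transcription sketch. The model directory and audio file are placeholders, and librosa is only one way to obtain normalized 16 kHz float samples:

```python
import librosa
import openvino_genai as ov_genai

raw_speech, _ = librosa.load("sample.wav", sr=16000)  # float32 in [-1, 1], 16 kHz
pipe = ov_genai.WhisperPipeline("./whisper_model_dir", "CPU")
result = pipe.generate(raw_speech.tolist(), return_timestamps=True)
print(result.texts[0])
for chunk in result.chunks or []:
    print(f"[{chunk.start_ts:.1f}s - {chunk.end_ts:.1f}s]{chunk.text}")
```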
+class draft_model: + """ + This class is used to enable Speculative Decoding + """ + def __init__(self, models_path: os.PathLike, device: str = '', **kwargs) -> None: + """ + device on which inference will be performed + """ diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 99c3c4518e..772ba0af8a 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -20,6 +20,11 @@ using ov::genai::AggregationMode; using ov::genai::CacheEvictionConfig; using ov::genai::ContinuousBatchingPipeline; using ov::genai::GenerationResult; +using ov::genai::EncodedGenerationResult; +using ov::genai::GenerationHandleImpl; +using ov::genai::GenerationOutput; +using ov::genai::GenerationFinishReason; +using ov::genai::GenerationStatus; using ov::genai::SchedulerConfig; using ov::genai::PipelineMetrics; @@ -118,7 +123,7 @@ void init_continuous_batching_pipeline(py::module_& m) { .def(py::init<>()) .def_readonly("m_request_id", &GenerationResult::m_request_id) .def_property("m_generation_ids", - [](GenerationResult &r) -> py::list { + [](GenerationResult &r) -> py::typing::List { return pyutils::handle_utf8(r.m_generation_ids); }, [](GenerationResult &r, std::vector &generation_ids) { @@ -133,9 +138,59 @@ void init_continuous_batching_pipeline(py::module_& m) { } ) .def("get_generation_ids", - [](GenerationResult &r) -> py::list { + [](GenerationResult &r) -> py::typing::List { return pyutils::handle_utf8(r.m_generation_ids); }); + + py::class_(m, "EncodedGenerationResult", generation_result_docstring) + .def(py::init<>()) + .def_readonly("m_request_id", &EncodedGenerationResult::m_request_id) + .def_readwrite("m_generation_ids", &EncodedGenerationResult::m_generation_ids) + .def_readwrite("m_scores", &EncodedGenerationResult::m_scores); + + py::enum_(m, "GenerationStatus") + .value("RUNNING", ov::genai::GenerationStatus::RUNNING) + .value("FINISHED", ov::genai::GenerationStatus::FINISHED) + .value("IGNORED", ov::genai::GenerationStatus::IGNORED) + .value("DROPPED_BY_PIPELINE", ov::genai::GenerationStatus::DROPPED_BY_PIPELINE) + .value("DROPPED_BY_HANDLE", ov::genai::GenerationStatus::DROPPED_BY_HANDLE); + + py::enum_(m, "GenerationFinishReason") + .value("NONE", ov::genai::GenerationFinishReason::NONE) + .value("STOP", ov::genai::GenerationFinishReason::STOP) + .value("LENGTH", ov::genai::GenerationFinishReason::LENGTH); + + py::class_>(m, "GenerationOutput") + .def_readwrite("generated_ids", &GenerationOutput::generated_ids) + .def_readwrite("generated_log_probs", &GenerationOutput::generated_log_probs) + .def_readwrite("score", &GenerationOutput::score) + .def_readwrite("finish_reason", &GenerationOutput::finish_reason); + + py::class_>(m, "GenerationHandle") + .def("get_status", &GenerationHandleImpl::get_status) + .def("can_read", &GenerationHandleImpl::can_read) + .def("drop", &GenerationHandleImpl::drop) + .def("back", &GenerationHandleImpl::back) + .def("read", &GenerationHandleImpl::read) + .def("read_all", &GenerationHandleImpl::read_all); + + // Binding for StopCriteria + py::enum_(m, "AggregationMode", + R"(Represents the mode of per-token score aggregation when determining least important tokens for eviction from cache + :param AggregationMode.SUM: In this mode the importance scores of each token will be summed after each step of generation + :param AggregationMode.NORM_SUM: Same as SUM, but the importance scores are additionally divided by the lifetime (in tokens generated) of a 
given token in cache)") + .value("SUM", AggregationMode::SUM) + .value("NORM_SUM", AggregationMode::NORM_SUM); + + py::class_(m, "CacheEvictionConfig", cache_eviction_config_docstring) + .def(py::init<>([](const size_t start_size, size_t recent_size, size_t max_cache_size, AggregationMode aggregation_mode) { + return CacheEvictionConfig{start_size, recent_size, max_cache_size, aggregation_mode}; }), + py::arg("start_size"), py::arg("recent_size"), py::arg("max_cache_size"), py::arg("aggregation_mode")) + .def_readwrite("aggregation_mode", &CacheEvictionConfig::aggregation_mode) + .def("get_start_size", &CacheEvictionConfig::get_start_size) + .def("get_recent_size", &CacheEvictionConfig::get_recent_size) + .def("get_max_cache_size", &CacheEvictionConfig::get_max_cache_size) + .def("get_evictable_size", &CacheEvictionConfig::get_evictable_size); py::class_(m, "SchedulerConfig", scheduler_config_docstring) .def(py::init<>()) @@ -148,20 +203,13 @@ void init_continuous_batching_pipeline(py::module_& m) { .def_readwrite("use_cache_eviction", &SchedulerConfig::use_cache_eviction) .def_readwrite("cache_eviction_config", &SchedulerConfig::cache_eviction_config); - py::class_(m, "CacheEvictionConfig", cache_eviction_config_docstring) - .def(py::init<>([](const size_t start_size, size_t recent_size, size_t max_cache_size, AggregationMode aggregation_mode) { - return CacheEvictionConfig{start_size, recent_size, max_cache_size, aggregation_mode}; }), - py::arg("start_size"), py::arg("recent_size"), py::arg("max_cache_size"), py::arg("aggregation_mode")) - .def_readwrite("aggregation_mode", &CacheEvictionConfig::aggregation_mode); - - // Binding for StopCriteria - py::enum_(m, "AggregationMode", - R"(Represents the mode of per-token score aggregation when determining least important tokens for eviction from cache - :param AggregationMode.SUM: In this mode the importance scores of each token will be summed after each step of generation - :param AggregationMode.NORM_SUM: Same as SUM, but the importance scores are additionally divided by the lifetime (in tokens generated) of a given token in cache)") - .value("SUM", AggregationMode::SUM) - .value("NORM_SUM", AggregationMode::NORM_SUM) - .export_values(); + py::class_(m, "PipelineMetrics", pipeline_metrics_docstring) + .def(py::init<>()) + .def_readonly("requests", &PipelineMetrics::requests) + .def_readonly("scheduled_requests", &PipelineMetrics::scheduled_requests) + .def_readonly("cache_usage", &PipelineMetrics::cache_usage) + .def_readonly("avg_cache_usage", &PipelineMetrics::avg_cache_usage) + .def_readonly("max_cache_usage", &PipelineMetrics::max_cache_usage); py::class_(m, "ContinuousBatchingPipeline", "This class is used for generation with LLMs with continuous batchig") .def(py::init([](const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) { @@ -187,8 +235,8 @@ void init_continuous_batching_pipeline(py::module_& m) { .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) .def("get_config", &ContinuousBatchingPipeline::get_config) .def("get_metrics", &ContinuousBatchingPipeline::get_metrics) - .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) - .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("sampling_params")) + 
.def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("sampling_params")) .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) .def( @@ -205,12 +253,4 @@ void init_continuous_batching_pipeline(py::module_& m) { py::arg("sampling_params"), py::arg("streamer") = std::monostate{} ); - - py::class_(m, "PipelineMetrics", pipeline_metrics_docstring) - .def(py::init<>()) - .def_readonly("requests", &PipelineMetrics::requests) - .def_readonly("scheduled_requests", &PipelineMetrics::scheduled_requests) - .def_readonly("cache_usage", &PipelineMetrics::cache_usage) - .def_readonly("avg_cache_usage", &PipelineMetrics::avg_cache_usage) - .def_readonly("max_cache_usage", &PipelineMetrics::max_cache_usage); } diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index 8a6cc8c492..2979c137ae 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -76,8 +76,7 @@ void init_generation_config(py::module_& m) { py::enum_(m, "StopCriteria", stop_criteria_docstring) .value("EARLY", StopCriteria::EARLY) .value("HEURISTIC", StopCriteria::HEURISTIC) - .value("NEVER", StopCriteria::NEVER) - .export_values(); + .value("NEVER", StopCriteria::NEVER); // Binding for GenerationConfig py::class_(m, "GenerationConfig", generation_config_docstring) @@ -114,6 +113,9 @@ void init_generation_config(py::module_& m) { .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output) .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids) .def_readwrite("adapters", &GenerationConfig::adapters) - .def("set_eos_token_id", &GenerationConfig::set_eos_token_id) - .def("is_beam_search", &GenerationConfig::is_beam_search); -} + .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) + .def("is_beam_search", &GenerationConfig::is_beam_search) + .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding) + .def("is_speculative_decoding", &GenerationConfig::is_speculative_decoding) + .def("update_generation_config", static_cast(&ov::genai::GenerationConfig::update_generation_config), py::arg("config_map")); + } diff --git a/src/python/py_image_generation_models.cpp b/src/python/py_image_generation_models.cpp index 5bcf8c6966..ffdaff1584 100644 --- a/src/python/py_image_generation_models.cpp +++ b/src/python/py_image_generation_models.cpp @@ -31,7 +31,7 @@ void init_clip_text_model(py::module_& m) { py::arg("root_dir"), "Model root directory", R"( CLIPTextModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. )") .def(py::init([]( const std::filesystem::path& root_dir, @@ -45,7 +45,7 @@ void init_clip_text_model(py::module_& m) { py::arg("device"), "Device on which inference will be done", R"( CLIPTextModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. 
)") @@ -65,15 +65,16 @@ void init_clip_text_model(py::module_& m) { const std::string& config_path ) { return std::make_unique(config_path); - })) + }), + py::arg("config_path")) .def_readwrite("max_position_embeddings", &ov::genai::CLIPTextModel::Config::max_position_embeddings) .def_readwrite("num_hidden_layers", &ov::genai::CLIPTextModel::Config::num_hidden_layers); clip_text_model.def("get_config", &ov::genai::CLIPTextModel::get_config); - clip_text_model.def("reshape", &ov::genai::CLIPTextModel::reshape); - clip_text_model.def("set_adapters", &ov::genai::CLIPTextModel::set_adapters); - clip_text_model.def("infer", &ov::genai::CLIPTextModel::infer); - clip_text_model.def("get_output_tensor", &ov::genai::CLIPTextModel::get_output_tensor); + clip_text_model.def("reshape", &ov::genai::CLIPTextModel::reshape, py::arg("batch_size")); + clip_text_model.def("set_adapters", &ov::genai::CLIPTextModel::set_adapters, py::arg("adapters")); + clip_text_model.def("infer", &ov::genai::CLIPTextModel::infer, py::arg("pos_prompt"), py::arg("neg_prompt"), py::arg("do_classifier_free_guidance")); + clip_text_model.def("get_output_tensor", &ov::genai::CLIPTextModel::get_output_tensor, py::arg("idx")); clip_text_model.def( "compile", [](ov::genai::CLIPTextModel& self, @@ -100,7 +101,7 @@ void init_unet2d_condition_model(py::module_& m) { py::arg("root_dir"), "Model root directory", R"( UNet2DConditionModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. )") .def(py::init([]( const std::filesystem::path& root_dir, @@ -113,7 +114,7 @@ void init_unet2d_condition_model(py::module_& m) { py::arg("device"), "Device on which inference will be done", R"( UNet2DConditionModel class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. 
)") @@ -133,16 +134,17 @@ void init_unet2d_condition_model(py::module_& m) { const std::filesystem::path& config_path ) { return std::make_unique(config_path); - })) + }), + py::arg("config_path")) .def_readwrite("in_channels", &ov::genai::UNet2DConditionModel::Config::in_channels) .def_readwrite("sample_size", &ov::genai::UNet2DConditionModel::Config::sample_size) .def_readwrite("time_cond_proj_dim", &ov::genai::UNet2DConditionModel::Config::time_cond_proj_dim); unet2d_condition_model.def("get_config", &ov::genai::UNet2DConditionModel::get_config); - unet2d_condition_model.def("reshape", &ov::genai::UNet2DConditionModel::reshape); - unet2d_condition_model.def("set_adapters", &ov::genai::UNet2DConditionModel::set_adapters); - unet2d_condition_model.def("infer", &ov::genai::UNet2DConditionModel::infer); - unet2d_condition_model.def("set_hidden_states", &ov::genai::UNet2DConditionModel::set_hidden_states); + unet2d_condition_model.def("reshape", &ov::genai::UNet2DConditionModel::reshape, py::arg("batch_size"), py::arg("height"), py::arg("width"), py::arg("tokenizer_model_max_length")); + unet2d_condition_model.def("set_adapters", &ov::genai::UNet2DConditionModel::set_adapters, py::arg("adapters")); + unet2d_condition_model.def("infer", &ov::genai::UNet2DConditionModel::infer, py::arg("sample"), py::arg("timestep")); + unet2d_condition_model.def("set_hidden_states", &ov::genai::UNet2DConditionModel::set_hidden_states, py::arg("tensor_name"), py::arg("encoder_hidden_states")); unet2d_condition_model.def( "compile", [](ov::genai::UNet2DConditionModel& self, @@ -169,7 +171,7 @@ void init_autoencoder_kl(py::module_& m) { py::arg("vae_decoder_path"), "VAE decoder directory", R"( AutoencoderKL class initialized only with decoder model. - vae_decoder_path (str): VAE decoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. )") .def(py::init([]( const std::filesystem::path& vae_encoder_path, @@ -181,8 +183,8 @@ void init_autoencoder_kl(py::module_& m) { py::arg("vae_decoder_path"), "VAE decoder directory", R"( AutoencoderKL class initialized with both encoder and decoder models. - vae_encoder_path (str): VAE encoder directory. - vae_decoder_path (str): VAE decoder directory. + vae_encoder_path (os.PathLike): VAE encoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. )") .def(py::init([]( const std::filesystem::path& vae_decoder_path, @@ -195,7 +197,7 @@ void init_autoencoder_kl(py::module_& m) { py::arg("device"), "Device on which inference will be done", R"( AutoencoderKL class initialized only with decoder model. - vae_decoder_path (str): VAE decoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. device (str): Device on which inference will be done. kwargs: Device properties. )") @@ -212,8 +214,8 @@ void init_autoencoder_kl(py::module_& m) { py::arg("device"), "Device on which inference will be done", R"( AutoencoderKL class initialized only with both encoder and decoder models. - vae_encoder_path (str): VAE encoder directory. - vae_decoder_path (str): VAE decoder directory. + vae_encoder_path (os.PathLike): VAE encoder directory. + vae_decoder_path (os.PathLike): VAE decoder directory. device (str): Device on which inference will be done. kwargs: Device properties. 
)") @@ -233,14 +235,15 @@ void init_autoencoder_kl(py::module_& m) { const std::filesystem::path& config_path ) { return std::make_unique(config_path); - })) + }), + py::arg("config_path")) .def_readwrite("in_channels", &ov::genai::AutoencoderKL::Config::in_channels) .def_readwrite("latent_channels", &ov::genai::AutoencoderKL::Config::latent_channels) .def_readwrite("out_channels", &ov::genai::AutoencoderKL::Config::out_channels) .def_readwrite("scaling_factor", &ov::genai::AutoencoderKL::Config::scaling_factor) .def_readwrite("block_out_channels", &ov::genai::AutoencoderKL::Config::block_out_channels); - autoencoder_kl.def("reshape", &ov::genai::AutoencoderKL::reshape); + autoencoder_kl.def("reshape", &ov::genai::AutoencoderKL::reshape, py::arg("batch_size"), py::arg("height"), py::arg("width")); autoencoder_kl.def( "compile", [](ov::genai::AutoencoderKL& self, @@ -255,8 +258,8 @@ void init_autoencoder_kl(py::module_& m) { device (str): Device to run the model on (e.g., CPU, GPU). kwargs: Device properties. )"); - autoencoder_kl.def("decode", &ov::genai::AutoencoderKL::decode); - autoencoder_kl.def("encode", &ov::genai::AutoencoderKL::encode); + autoencoder_kl.def("decode", &ov::genai::AutoencoderKL::decode, py::arg("latent")); + autoencoder_kl.def("encode", &ov::genai::AutoencoderKL::encode, py::arg("image")); autoencoder_kl.def("get_config", &ov::genai::AutoencoderKL::get_config); autoencoder_kl.def("get_vae_scale_factor", &ov::genai::AutoencoderKL::get_vae_scale_factor); } @@ -272,7 +275,7 @@ void init_clip_text_model_with_projection(py::module_& m) { py::arg("root_dir"), "Model root directory", R"( CLIPTextModelWithProjection class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. )") .def(py::init([]( const std::filesystem::path& root_dir, @@ -286,7 +289,7 @@ void init_clip_text_model_with_projection(py::module_& m) { py::arg("device"), "Device on which inference will be done", R"( CLIPTextModelWithProjection class - root_dir (str): Model root directory. + root_dir (os.PathLike): Model root directory. device (str): Device on which inference will be done. kwargs: Device properties. 
)") @@ -306,14 +309,16 @@ void init_clip_text_model_with_projection(py::module_& m) { const std::filesystem::path& config_path ) { return std::make_unique(config_path); - })) + }), + py::arg("config_path")) .def_readwrite("max_position_embeddings", &ov::genai::CLIPTextModelWithProjection::Config::max_position_embeddings) .def_readwrite("num_hidden_layers", &ov::genai::CLIPTextModelWithProjection::Config::num_hidden_layers); - clip_text_model_with_projection.def("reshape", &ov::genai::CLIPTextModelWithProjection::reshape); - clip_text_model_with_projection.def("infer", &ov::genai::CLIPTextModelWithProjection::infer); + clip_text_model_with_projection.def("reshape", &ov::genai::CLIPTextModelWithProjection::reshape, py::arg("batch_size")); + clip_text_model_with_projection.def("infer", &ov::genai::CLIPTextModelWithProjection::infer, py::arg("pos_prompt"), py::arg("neg_prompt"), py::arg("do_classifier_free_guidance")); clip_text_model_with_projection.def("get_config", &ov::genai::CLIPTextModelWithProjection::get_config); - clip_text_model_with_projection.def("get_output_tensor", &ov::genai::CLIPTextModelWithProjection::get_config); + clip_text_model_with_projection.def("get_output_tensor", &ov::genai::CLIPTextModelWithProjection::get_output_tensor, py::arg("idx")); + clip_text_model_with_projection.def("set_adapters", &ov::genai::CLIPTextModelWithProjection::set_adapters, py::arg("adapters")); clip_text_model_with_projection.def( "compile", [](ov::genai::CLIPTextModelWithProjection& self, diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index 16a66cd84d..6e46ba59e0 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -171,13 +171,6 @@ void init_unet2d_condition_model(py::module_& m); void init_autoencoder_kl(py::module_& m); void init_image_generation_pipelines(py::module_& m) { - - // init image generation models - init_clip_text_model(m); - init_clip_text_model_with_projection(m); - init_unet2d_condition_model(m); - init_autoencoder_kl(m); - py::class_>(m, "Generator", "This class is used for storing pseudo-random generator.") .def(py::init<>()); @@ -186,9 +179,29 @@ void init_image_generation_pipelines(py::module_& m) { uint32_t seed ) { return std::make_unique(seed); - })) + }), + py::arg("seed")) .def("next", &ov::genai::CppStdGenerator::next) - .def("randn_tensor", &ov::genai::CppStdGenerator::randn_tensor); + .def("randn_tensor", &ov::genai::CppStdGenerator::randn_tensor, py::arg("shape")); + + // init image generation models + init_clip_text_model(m); + init_clip_text_model_with_projection(m); + init_unet2d_condition_model(m); + init_autoencoder_kl(m); + + auto image_generation_scheduler = py::class_>(m, "Scheduler", "Scheduler for image generation pipelines."); + py::enum_(image_generation_scheduler, "Type") + .value("AUTO", ov::genai::Scheduler::Type::AUTO) + .value("LCM", ov::genai::Scheduler::Type::LCM) + .value("LMS_DISCRETE", ov::genai::Scheduler::Type::LMS_DISCRETE) + .value("DDIM", ov::genai::Scheduler::Type::DDIM) + .value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE) + .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE); + image_generation_scheduler.def_static("from_config", + &ov::genai::Scheduler::from_config, + py::arg("scheduler_config_path"), + py::arg_v("scheduler_type", ov::genai::Scheduler::Type::AUTO, "Scheduler.Type.AUTO")); py::class_(m, "ImageGenerationConfig", "This class is used for storing 
generation config for image generation pipeline.") .def(py::init<>()) @@ -222,7 +235,7 @@ void init_image_generation_pipelines(py::module_& m) { py::arg("models_path"), "folder with exported model files.", R"( Text2ImagePipeline class constructor. - models_path (str): Path to the folder with exported model files. + models_path (os.PathLike): Path to the folder with exported model files. )") .def(py::init([]( @@ -237,17 +250,17 @@ void init_image_generation_pipelines(py::module_& m) { py::arg("device"), "device on which inference will be done", R"( Text2ImagePipeline class constructor. - models_path (str): Path with exported model files. + models_path (os.PathLike): Path with exported model files. device (str): Device to run the model on (e.g., CPU, GPU). kwargs: Text2ImagePipeline properties )") .def("get_generation_config", &ov::genai::Text2ImagePipeline::get_generation_config) - .def("set_generation_config", &ov::genai::Text2ImagePipeline::set_generation_config) - .def("set_scheduler", &ov::genai::Text2ImagePipeline::set_scheduler) - .def("reshape", &ov::genai::Text2ImagePipeline::reshape) - .def("stable_diffusion", &ov::genai::Text2ImagePipeline::stable_diffusion) - .def("latent_consistency_model", &ov::genai::Text2ImagePipeline::latent_consistency_model) - .def("stable_diffusion_xl", &ov::genai::Text2ImagePipeline::stable_diffusion_xl) + .def("set_generation_config", &ov::genai::Text2ImagePipeline::set_generation_config, py::arg("generation_config")) + .def("set_scheduler", &ov::genai::Text2ImagePipeline::set_scheduler, py::arg("scheduler")) + .def("reshape", &ov::genai::Text2ImagePipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale")) + .def_static("stable_diffusion", &ov::genai::Text2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) + .def_static("latent_consistency_model", &ov::genai::Text2ImagePipeline::latent_consistency_model, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) + .def_static("stable_diffusion_xl", &ov::genai::Text2ImagePipeline::stable_diffusion_xl, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("clip_text_model_with_projection"), py::arg("unet"), py::arg("vae")) .def( "compile", [](ov::genai::Text2ImagePipeline& pipe, @@ -267,22 +280,11 @@ void init_image_generation_pipelines(py::module_& m) { [](ov::genai::Text2ImagePipeline& pipe, const std::string& prompt, const py::kwargs& kwargs - ) { + ) -> py::typing::Union { ov::AnyMap params = text2image_kwargs_to_any_map(kwargs, false); return py::cast(pipe.generate(prompt, params)); }, py::arg("prompt"), "Input string", (text2image_generate_docstring + std::string(" \n ")).c_str() ); - - auto image_generation_scheduler = py::class_>(m, "Scheduler", "Scheduler for image generation pipelines.") - .def("from_config", &ov::genai::Scheduler::from_config); - - py::enum_(image_generation_scheduler, "Type") - .value("AUTO", ov::genai::Scheduler::Type::AUTO) - .value("LCM", ov::genai::Scheduler::Type::LCM) - .value("LMS_DISCRETE", ov::genai::Scheduler::Type::LMS_DISCRETE) - .value("DDIM", ov::genai::Scheduler::Type::DDIM) - .value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE) - .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE); } diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp index ad85ca6788..7255022238 100644 --- a/src/python/py_llm_pipeline.cpp +++ 
b/src/python/py_llm_pipeline.cpp @@ -122,7 +122,7 @@ void init_llm_pipeline(py::module_& m) { py::arg("config") = ov::AnyMap({}), "openvino.properties map", R"( LLMPipeline class constructor for manually created openvino_genai.Tokenizer. - models_path (str): Path to the model file. + models_path (os.PathLike): Path to the model file. tokenizer (openvino_genai.Tokenizer): tokenizer object. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. @@ -151,7 +151,7 @@ void init_llm_pipeline(py::module_& m) { py::arg("config") = ov::AnyMap({}), "openvino.properties map", R"( LLMPipeline class constructor. - models_path (str): Path to the model file. + models_path (os.PathLike): Path to the model file. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. kwargs: Device properties. @@ -164,7 +164,7 @@ void init_llm_pipeline(py::module_& m) { const OptionalGenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs - ) { + ) -> py::typing::Union { return call_common_generate(pipe, inputs, generation_config, streamer, kwargs); }, py::arg("inputs"), "Input string, or list of string or encoded tokens", @@ -180,7 +180,7 @@ void init_llm_pipeline(py::module_& m) { const OptionalGenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs - ) { + ) -> py::typing::Union { return call_common_generate(pipe, inputs, generation_config, streamer, kwargs); }, py::arg("inputs"), "Input string, or list of string or encoded tokens", @@ -193,7 +193,7 @@ void init_llm_pipeline(py::module_& m) { .def("start_chat", &LLMPipeline::start_chat, py::arg("system_message") = "") .def("finish_chat", &LLMPipeline::finish_chat) .def("get_generation_config", &LLMPipeline::get_generation_config, py::return_value_policy::copy) - .def("set_generation_config", &LLMPipeline::set_generation_config); + .def("set_generation_config", &LLMPipeline::set_generation_config, py::arg("config")); py::class_(m, "draft_model", py::module_local(), "This class is used to enable Speculative Decoding") .def(py::init([]( diff --git a/src/python/py_lora_adapter.cpp b/src/python/py_lora_adapter.cpp index 751e950ada..7f98b67064 100644 --- a/src/python/py_lora_adapter.cpp +++ b/src/python/py_lora_adapter.cpp @@ -23,7 +23,7 @@ void init_lora_adapter(py::module_& m) { py::arg("path"), "path", R"( Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier. - path (str): Path to adapter file in safetensors format. + path (os.PathLike): Path to adapter file in safetensors format. 
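The LLMPipeline and LoRA hunks mostly add argument names and os.PathLike docstrings. A sketch of how they compose, with placeholder model and adapter paths; the adapters keyword mirrors the existing LoRA sample:

```python
from pathlib import Path

import openvino_genai

models_path = Path("./TinyLlama-1.1B-Chat-v1.0")                        # placeholder
adapter = openvino_genai.Adapter(Path("./adapter_model.safetensors"))   # placeholder

pipe = openvino_genai.LLMPipeline(
    models_path,
    "CPU",
    adapters=openvino_genai.AdapterConfig(adapter),
)

config = pipe.get_generation_config()
config.max_new_tokens = 32
pipe.set_generation_config(config)   # the argument is now named "config"

print(pipe.generate("What is OpenVINO?"))
```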
)") .def( "__bool__", @@ -44,7 +44,7 @@ void init_lora_adapter(py::module_& m) { ov::genai::AdapterConfig::Mode mode) { return std::make_unique(mode); }), - py::arg("mode") = ov::genai::AdapterConfig::Mode::MODE_AUTO); + py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def(py::init([]( const ov::genai::Adapter& adapter, @@ -54,7 +54,7 @@ void init_lora_adapter(py::module_& m) { }), py::arg("adapter"), py::arg("alpha"), - py::arg("mode") = ov::genai::AdapterConfig::Mode::MODE_AUTO); + py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def(py::init([]( const ov::genai::Adapter& adapter, @@ -62,7 +62,7 @@ void init_lora_adapter(py::module_& m) { return std::make_unique(adapter, mode); }), py::arg("adapter"), - py::arg("mode") = ov::genai::AdapterConfig::Mode::MODE_AUTO); + py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def(py::init([]( const std::vector& adapters, @@ -70,7 +70,7 @@ void init_lora_adapter(py::module_& m) { return std::make_unique(adapters, mode); }), py::arg("adapters"), - py::arg("mode") = ov::genai::AdapterConfig::Mode::MODE_AUTO); + py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def(py::init([]( const std::vector>& adapters, @@ -78,7 +78,7 @@ void init_lora_adapter(py::module_& m) { return std::make_unique(adapters, mode); }), py::arg("adapters"), - py::arg("mode") = ov::genai::AdapterConfig::Mode::MODE_AUTO); + py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def( "__bool__", [](ov::genai::AdapterConfig& self @@ -86,10 +86,10 @@ void init_lora_adapter(py::module_& m) { return bool(self); }); - adapter_config.def("set_alpha", &ov::genai::AdapterConfig::set_alpha); - adapter_config.def("get_alpha", &ov::genai::AdapterConfig::get_alpha); - adapter_config.def("remove", &ov::genai::AdapterConfig::remove); + adapter_config.def("set_alpha", &ov::genai::AdapterConfig::set_alpha, py::arg("adapter"), py::arg("alpha")); + adapter_config.def("get_alpha", &ov::genai::AdapterConfig::get_alpha, py::arg("adapter")); + adapter_config.def("remove", &ov::genai::AdapterConfig::remove, py::arg("adapter")); adapter_config.def("get_adapters", &ov::genai::AdapterConfig::get_adapters); - adapter_config.def("add", static_cast(&ov::genai::AdapterConfig::add)); - adapter_config.def("add", static_cast(&ov::genai::AdapterConfig::add)); + adapter_config.def("add", static_cast(&ov::genai::AdapterConfig::add), py::arg("adapter"), py::arg("alpha")); + adapter_config.def("add", static_cast(&ov::genai::AdapterConfig::add), py::arg("adapter")); } diff --git a/src/python/py_openvino_genai.cpp b/src/python/py_openvino_genai.cpp index 23391db98f..e821c1cfdc 100644 --- a/src/python/py_openvino_genai.cpp +++ b/src/python/py_openvino_genai.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "openvino/genai/llm_pipeline.hpp" @@ -82,9 +83,10 @@ class ConstructableStreamer: public StreamerBase { PYBIND11_MODULE(py_openvino_genai, m) { m.doc() = "Pybind11 binding for OpenVINO GenAI library"; + init_perf_metrics(m); py::class_(m, "DecodedResults", decoded_results_docstring) .def(py::init<>()) - .def_property_readonly("texts", [](const DecodedResults &dr) { return pyutils::handle_utf8((std::vector)dr); }) + .def_property_readonly("texts", [](const DecodedResults &dr) -> py::typing::List { return 
pyutils::handle_utf8((std::vector)dr); }) .def_readonly("scores", &DecodedResults::scores) .def_readonly("perf_metrics", &DecodedResults::perf_metrics) .def("__str__", [](const DecodedResults &dr) -> py::str { @@ -107,13 +109,12 @@ PYBIND11_MODULE(py_openvino_genai, m) { py::class_>(m, "StreamerBase", streamer_base_docstring) // Change the holder form unique_ptr to shared_ptr .def(py::init<>()) - .def("put", &StreamerBase::put, "Put is called every time new token is decoded. Returns a bool flag to indicate whether generation should be stopped, if return true generation stops") + .def("put", &StreamerBase::put, "Put is called every time new token is decoded. Returns a bool flag to indicate whether generation should be stopped, if return true generation stops", py::arg("token")) .def("end", &StreamerBase::end, "End is called at the end of generation. It can be used to flush cache if your own streamer has one"); init_tokenizer(m); - init_perf_metrics(m); - init_generation_config(m); init_lora_adapter(m); + init_generation_config(m); init_continuous_batching_pipeline(m); init_llm_pipeline(m); diff --git a/src/python/py_perf_metrics.cpp b/src/python/py_perf_metrics.cpp index 679acc2b9d..1d37784e27 100644 --- a/src/python/py_perf_metrics.cpp +++ b/src/python/py_perf_metrics.cpp @@ -171,7 +171,7 @@ void init_perf_metrics(py::module_& m) { .def("get_inference_duration", &PerfMetrics::get_inference_duration) .def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration) .def("get_detokenization_duration", &PerfMetrics::get_detokenization_duration) - .def("__add__", &PerfMetrics::operator+) - .def("__iadd__", &PerfMetrics::operator+=) + .def("__add__", &PerfMetrics::operator+, py::arg("metrics")) + .def("__iadd__", &PerfMetrics::operator+=, py::arg("right")) .def_readonly("raw_metrics", &PerfMetrics::raw_metrics); } diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp index e53c0c80cf..b3c52cd28b 100644 --- a/src/python/py_tokenizer.cpp +++ b/src/python/py_tokenizer.cpp @@ -22,7 +22,7 @@ using ov::genai::Tokenizer; void init_tokenizer(py::module_& m) { py::class_(m, "TokenizedInputs") - .def(py::init()) + .def(py::init(), py::arg("input_ids"), py::arg("attention_mask")) .def_readwrite("input_ids", &TokenizedInputs::input_ids) .def_readwrite("attention_mask", &TokenizedInputs::attention_mask); @@ -63,7 +63,7 @@ void init_tokenizer(py::module_& m) { .def( "decode", - [](Tokenizer& tok, ov::Tensor& tokens) -> py::list { + [](Tokenizer& tok, ov::Tensor& tokens) -> py::typing::List { return pyutils::handle_utf8(tok.decode(tokens)); }, py::arg("tokens"), @@ -71,7 +71,7 @@ void init_tokenizer(py::module_& m) { .def( "decode", - [](Tokenizer& tok, std::vector>& tokens) -> py::list{ + [](Tokenizer& tok, std::vector>& tokens) -> py::typing::List { return pyutils::handle_utf8(tok.decode(tokens)); }, py::arg("tokens"), diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index e26585dd53..423ef9b384 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -24,8 +24,8 @@ auto vlm_generate_docstring = R"( :param prompt: input prompt :type prompt: str - :param images: list of images - :type inputs: List[ov.Tensor] + :param images: image or list of images + :type images: List[ov.Tensor] or ov.Tensor :param generation_config: generation_config :type generation_config: GenerationConfig or a Dict @@ -128,17 +128,17 @@ void init_vlm_pipeline(py::module_& m) { py::arg("device"), "device on which inference will be done" R"( VLMPipeline class 
constructor. - models_path (str): Path to the folder with exported model files. + models_path (os.PathLike): Path to the folder with exported model files. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. kwargs: Device properties )") .def("start_chat", &ov::genai::VLMPipeline::start_chat, py::arg("system_message") = "") .def("finish_chat", &ov::genai::VLMPipeline::finish_chat) - .def("set_chat_template", &ov::genai::VLMPipeline::set_chat_template) + .def("set_chat_template", &ov::genai::VLMPipeline::set_chat_template, py::arg("new_template")) .def("get_tokenizer", &ov::genai::VLMPipeline::get_tokenizer) .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config) - .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config) + .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config, py::arg("new_config")) .def( "generate", [](ov::genai::VLMPipeline& pipe, @@ -147,12 +147,29 @@ void init_vlm_pipeline(py::module_& m) { const ov::genai::GenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs - ) { + ) -> py::typing::Union { return call_vlm_generate(pipe, prompt, images, generation_config, streamer, kwargs); }, py::arg("prompt"), "Input string", py::arg("images"), "Input images", - py::arg("generation_config") = std::nullopt, "generation_config", + py::arg("generation_config"), "generation_config", + py::arg("streamer") = std::monostate(), "streamer", + (vlm_generate_docstring + std::string(" \n ")).c_str() + ) + .def( + "generate", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const ov::Tensor& images, + const ov::genai::GenerationConfig& generation_config, + const pyutils::PyBindStreamerVariant& streamer, + const py::kwargs& kwargs + ) -> py::typing::Union { + return call_vlm_generate(pipe, prompt, {images}, generation_config, streamer, kwargs); + }, + py::arg("prompt"), "Input string", + py::arg("images"), "Input images", + py::arg("generation_config"), "generation_config", py::arg("streamer") = std::monostate(), "streamer", (vlm_generate_docstring + std::string(" \n ")).c_str() ) @@ -161,7 +178,7 @@ void init_vlm_pipeline(py::module_& m) { [](ov::genai::VLMPipeline& pipe, const std::string& prompt, const py::kwargs& kwargs - ) { + ) -> py::typing::Union { return py::cast(pipe.generate(prompt, vlm_kwargs_to_any_map(kwargs, false))); }, py::arg("prompt"), "Input string", diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index c4eea6809b..29f98c7dd6 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -235,7 +235,7 @@ void init_whisper_pipeline(py::module_& m) { .def_readwrite("lang_to_id", &WhisperGenerationConfig::lang_to_id) .def_readwrite("task", &WhisperGenerationConfig::task) .def_readwrite("return_timestamps", &WhisperGenerationConfig::return_timestamps) - .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id); + .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")); py::class_(m, "WhisperDecodedResultChunk", whisper_decoded_result_chunk) .def(py::init<>()) @@ -262,7 +262,7 @@ void init_whisper_pipeline(py::module_& m) { "openvino.properties map", R"( WhisperPipeline class constructor. - models_path (str): Path to the model file. + models_path (os.PathLike): Path to the model file. device (str): Device to run the model on (e.g., CPU, GPU). 
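The second generate overload added above accepts a single ov.Tensor in place of a list. A sketch in which the model directory is a placeholder and the zero-filled tensor stands in for a real image:

```python
from pathlib import Path

import numpy as np
import openvino as ov
import openvino_genai

pipe = openvino_genai.VLMPipeline(Path("./MiniCPM-V-2_6"), "CPU")

image = ov.Tensor(np.zeros((1, 448, 448, 3), dtype=np.uint8))

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 64

# Single-image and list-of-images calls now resolve to different overloads;
# generation_config no longer has a default in either of them.
print(pipe.generate("Describe the image.", images=image, generation_config=config))
print(pipe.generate("Compare the images.", images=[image, image], generation_config=config))
```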
)") @@ -272,7 +272,7 @@ void init_whisper_pipeline(py::module_& m) { const RawSpeechInput& raw_speech_input, const OptionalWhisperGenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& streamer, - const py::kwargs& kwargs) { + const py::kwargs& kwargs) -> py::typing::Union { return call_whisper_common_generate(pipe, raw_speech_input, generation_config, streamer, kwargs); }, py::arg("raw_speech_input"), @@ -286,5 +286,5 @@ void init_whisper_pipeline(py::module_& m) { .def("get_tokenizer", &WhisperPipeline::get_tokenizer) .def("get_generation_config", &WhisperPipeline::get_generation_config, py::return_value_policy::copy) - .def("set_generation_config", &WhisperPipeline::set_generation_config); + .def("set_generation_config", &WhisperPipeline::set_generation_config, py::arg("config")); }