Skip to content

Commit

Permalink
Port from master (#1285)
Browse files — Browse the repository at this point in the history
      - #1158
- #1178
- #1214
- #1243
- #1253
- #1259
- #1266
- #1271
- #1278
- #1280
- #1284
- e4a86f6
- #1246
- #958

---------

Co-authored-by: Anastasiia Pnevskaia <[email protected]>
Co-authored-by: Helena Kloosterman <[email protected]>
Co-authored-by: Vladimir Zlobin <[email protected]>
Co-authored-by: Dmitry Matveev <[email protected]>
Co-authored-by: Anna Likholat <[email protected]>
Co-authored-by: Alina Kladieva <[email protected]>
  • Loading branch information
7 people authored Dec 4, 2024
1 parent e42723a commit 8bef5a3
Show file tree
Hide file tree
Showing 35 changed files with 2,245 additions and 208 deletions.
2 changes: 1 addition & 1 deletion .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
- 'tests/cpp/generate_config.cpp'
- 'tests/cpp/sampler.cpp'

- 'category: LoRA':
'category: LoRA':
- 'src/cpp/include/openvino/genai/lora_adapter.hpp'
- 'src/cpp/src/lora_adapter.cpp'
- 'src/cpp/src/lora_helper.cpp'
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ jobs:
- name: Test bindings (wheel)
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install . --verbose
python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
genai_python_lib_whisper:
Expand Down Expand Up @@ -307,7 +307,7 @@ jobs:
- name: Test bindings (wheel)
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install . --verbose
python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
genai_python_lib_vlm:
Expand Down
35 changes: 30 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,45 @@ if(POLICY CMP0169)
cmake_policy(SET CMP0169 OLD)
endif()

if(UNIX AND NOT (APPLE OR ANDROID OR CYGWIN))
set(LINUX ON)
endif()

project(OpenVINOGenAI
VERSION 2024.5.0.0
DESCRIPTION "OpenVINO GenAI"
HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai"
LANGUAGES CXX C)

if(NOT DEFINED Python3_FIND_VIRTUALENV)
set(Python3_FIND_VIRTUALENV FIRST)
endif()

# Looking for OpenVINO in the python distribution. It doesn't work for cross-compiling build
if(NOT CMAKE_CROSSCOMPILING)
find_package(Python3 REQUIRED)
execute_process(
COMMAND ${Python3_EXECUTABLE} -c "from openvino.utils import get_cmake_path; print(get_cmake_path(), end='')"
OUTPUT_VARIABLE OpenVINO_DIR_PY
ERROR_QUIET
)
endif()

# Find OpenVINODeveloperPackage first to compile with SDL flags
find_package(OpenVINODeveloperPackage ${OpenVINOGenAI_VERSION} QUIET
COMPONENTS Runtime Threading
PATHS "${OpenVINO_DIR}")
if(NOT OpenVINODeveloperPackage_FOUND)
find_package(OpenVINO ${OpenVINOGenAI_VERSION} REQUIRED
COMPONENTS Runtime Threading)
COMPONENTS Runtime Threading
PATHS "${OpenVINO_DIR_PY}")
endif()

include(cmake/features.cmake)

if(ENABLE_PYTHON)
# the following two calls are required for cross-compilation
if(OpenVINODeveloperPackage_DIR)
if(OpenVINODeveloperPackage_FOUND)
ov_find_python3(REQUIRED)
ov_detect_python_module_extension()
else()
Expand All @@ -62,9 +81,15 @@ endif()

add_subdirectory(thirdparty)
add_subdirectory(src)
add_subdirectory(samples)
add_subdirectory(tools/continuous_batching)
add_subdirectory(tests/cpp)
if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/samples")
add_subdirectory(samples)
endif()
if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/tools/continuous_batching")
add_subdirectory(tools/continuous_batching)
endif()
if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/tests/cpp")
add_subdirectory(tests/cpp)
endif()

install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI)
install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt)
Expand Down
25 changes: 21 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,34 @@ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --

### Run generation using VLMPipeline API in Python

See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/visual_language_chat) for a demo application.

Run the following command to download a sample image:

```sh
curl -O "https://storage.openvinotoolkit.org/test_data/images/dog.jpg"
```

```python
import numpy as np
import openvino as ov
import openvino_genai as ov_genai
#Will run model on CPU, GPU is a possible option
from PIL import Image

# Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
pipe = ov_genai.VLMPipeline("./MiniCPM-V-2_6/", "CPU")
rgb = read_image("cat.jpg")
print(pipe.generate(prompt, image=rgb, max_new_tokens=100))

image = Image.open("dog.jpg")
image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
image_data = ov.Tensor(image_data)

prompt = "Can you describe the image?"
print(pipe.generate(prompt, image=image_data, max_new_tokens=100))
```

### Run generation using VLMPipeline in C++

Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details)
Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details). See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/cpp/visual_language_chat) for a demo application.

```cpp
#include "load_image.hpp"
Expand Down
29 changes: 23 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,31 @@ name = "openvino-genai"
version = "2024.5.0.0"
description = "Library of the most popular Generative AI model pipelines, optimized execution methods, and samples"
requires-python = ">=3.9"
readme = {file = "src/README.md", content-type="text/markdown"}
license = {text = "OSI Approved :: Apache Software License"}
readme = { file = "src/README.md", content-type="text/markdown" }
license = { "file" = "LICENSE" }
authors = [
{ name = "OpenVINO Developers", email = "[email protected]" },
]
classifiers = [
"Development Status :: 5 - Production/Stable",
"License :: OSI Approved :: Apache Software License",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries :: Python Modules",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Operating System :: Unix",
"Operating System :: POSIX :: Linux",
"Operating System :: Microsoft :: Windows",
"Operating System :: MacOS",
"Programming Language :: C++",
"Programming Language :: C",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: Implementation :: CPython"
]
dependencies = [
"openvino_tokenizers~=2024.5.0.0.dev"
Expand All @@ -22,22 +37,24 @@ dependencies = [
directory = "src/python"

[tool.py-build-cmake.sdist]
exclude = ["tools", "samples", "tests", "thirdparty"]
include = ["CMakeLists.txt", "LICENSE", "third-party-programs.txt", "SECURITY.md", "cmake", "src", "thirdparty"]

[tool.py-build-cmake.cmake]
minimum_version = "3.23"
build_type = "Release"
config = ["Release"]
find_python3 = true
build_args = ["--parallel", "--target", "py_openvino_genai"]
build_args = ["--parallel", "--target", "py_openvino_genai_stub"]
install_args = ["--strip"]
install_components = ["wheel_genai"]
options = {"BUILD_TOKENIZERS" = "OFF"}

[build-system]
requires = [
"py-build-cmake@git+https://github.com/tttapa/py-build-cmake@7ab73da351c7140f06d727a8705bece4cf544cd9",
"cmake~=3.23"
"py-build-cmake==0.3.1",
"pybind11-stubgen==2.5.1",
"openvino~=2024.5.0.0.dev",
"cmake~=3.23.0"
]
build-backend = "py_build_cmake.build"

Expand Down
3 changes: 2 additions & 1 deletion requirements-build.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
cmake~=3.30
cmake~=3.23.0
pybind11-stubgen==2.5.1
15 changes: 15 additions & 0 deletions src/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,21 @@ else()
SOVERSION ${MAJOR_SUFFIX}${OpenVINOGenAI_VERSION_MINOR}${OpenVINOGenAI_VERSION_PATCH})
endif()

if(OpenVINODeveloperPackage_FOUND)
# must be called after all target_link_libraries
# ov_add_api_validator_post_build_step(TARGET ${TARGET_NAME})

ov_ncc_naming_style(FOR_TARGET ${TARGET_NAME}
SOURCE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/include")

# TODO: override versions as currently they come from OpenVINO
# ov_add_vs_version_file(NAME ${TARGET_NAME}
# FILEDESCRIPTION "OpenVINO GenAI library")

# TODO: commit changes separately
# ov_add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
endif()

# - Windows: `<openvino_dir>\runtime\bin\intel64\Release\`
# - MacOS_x86: `<openvino_dir>/runtime/lib/intel64/Release`
# - MacOS_arm64: `<openvino_dir>/runtime/lib/arm64/Release/`
Expand Down
13 changes: 13 additions & 0 deletions src/cpp/include/openvino/genai/visual_language/pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,19 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
const StreamerVariant& streamer
);

/// @brief Generate a response given a prompt and uint8 RGB image with [NHWC] or [HWC] layout.
/// @param prompt A prompt to respond to.
/// @param image Image to be prepended to a prompt.
/// @param generation_config A config to follow for text generation.
/// @param streamer A streamer to acquire intermediate result.
/// @return A string generated by a model.
DecodedResults generate(
const std::string& prompt,
const ov::Tensor& rgb,
const GenerationConfig& generation_config,
const StreamerVariant& streamer
);

/// @brief Generate a response given a prompt and config.
/// @param prompt A prompt to respond to.
/// @param config_map A config may contain GenerationConfig, values
Expand Down
20 changes: 14 additions & 6 deletions src/cpp/src/block_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

#include "sequence_group.hpp"


namespace ov::genai {

class KVCacheBlock {
Expand Down Expand Up @@ -188,7 +187,10 @@ class CacheStateDumper;
*/
class BlockAllocator {
std::vector<std::list<KVCacheBlock::Ptr>> m_free_blocks;
int m_total_num_blocks;
// We keep m_free_blocks_num instead of m_free_blocks[X].size() to WA old CXX library implementation issue for std::list::size()
// see https://stackoverflow.com/questions/13157164/why-isnt-stdlist-size-constant-time
std::vector<size_t> m_free_blocks_num;
size_t m_total_num_blocks;
friend class CacheStateDumper;
size_t m_num_layers;
bool m_enable_prefix_caching;
Expand All @@ -202,8 +204,8 @@ class BlockAllocator {
* @param num_layers The number of separate attention layers with KV caches in the LLM associated with the pipeline.
* Blocks returned will be vectors with this size, each vector entry to be associated with a separate layer's KV cache.
*/
BlockAllocator(int num_blocks, bool enable_prefix_caching, size_t num_layers = 1) :
m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
BlockAllocator(size_t num_blocks, bool enable_prefix_caching, size_t num_layers = 1) :
m_free_blocks_num(num_layers, num_blocks), m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero");
m_free_blocks.resize(m_num_layers);
for (auto& per_layer_block_list : m_free_blocks) {
Expand All @@ -224,7 +226,7 @@ class BlockAllocator {
* @return Number of free blocks for this layer.
*/
size_t num_free_blocks(size_t layer_idx) const {
return m_free_blocks[layer_idx].size() + m_overwriteable_blocks.num_blocks();
return m_free_blocks_num[layer_idx] + num_overwriteable_blocks();
}

/**
Expand Down Expand Up @@ -270,6 +272,7 @@ class BlockAllocator {
block_ptr->release();
if (block_ptr->is_free()) {
m_free_blocks[layer_idx].push_back(block_ptr);
++m_free_blocks_num[layer_idx];
}
}

Expand Down Expand Up @@ -325,6 +328,7 @@ class BlockAllocator {
// actual collision case
for (size_t layer_idx = 0; layer_idx < colliding_blocks_per_layer.size(); layer_idx++) {
m_free_blocks[layer_idx].push_back(colliding_blocks_per_layer[layer_idx]);
++m_free_blocks_num[layer_idx];
}
}
m_overwriteable_blocks.add(blocks_for_all_layers);
Expand All @@ -333,12 +337,14 @@ class BlockAllocator {
// TODO (vshampor): more fine-grained hash store control
for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) {
m_free_blocks[layer_idx].push_back(blocks_for_all_layers[layer_idx]);
++m_free_blocks_num[layer_idx];
}
}
}
else {
for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) {
m_free_blocks[layer_idx].push_back(blocks_for_all_layers[layer_idx]);
++m_free_blocks_num[layer_idx];
}
}
}
Expand Down Expand Up @@ -368,6 +374,7 @@ class BlockAllocator {
KVCacheBlock::Ptr allocated_block = m_free_blocks[layer_idx].front();
allocated_block->increment();
m_free_blocks[layer_idx].pop_front();
--m_free_blocks_num[layer_idx];
return allocated_block;
}

Expand All @@ -386,7 +393,7 @@ class BlockAllocator {
OPENVINO_ASSERT(m_enable_prefix_caching);
OPENVINO_ASSERT(can_allocate_blocks(1));

if (m_free_blocks[0].size() > 0) {
if (m_free_blocks_num[0] > 0) {
// allocate new empty block
BlocksPerLayer allocated_blocks;
allocated_blocks.reserve(m_num_layers);
Expand All @@ -396,6 +403,7 @@ class BlockAllocator {
allocated_block->set_hash(hash);
allocated_blocks.push_back(allocated_block);
m_free_blocks[i].pop_front();
--m_free_blocks_num[i];
}
cached_blocks[hash] = allocated_blocks;
return allocated_blocks;
Expand Down
22 changes: 2 additions & 20 deletions src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(

ov::Core core;

auto [core_properties, compile_properties] = utils::split_core_complile_config(properties);
auto [core_properties, compile_properties] = utils::split_core_compile_config(properties);
core.set_property(core_properties);

// The model can be compiled for GPU as well
Expand Down Expand Up @@ -57,7 +57,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
}

SchedulerConfig updated_config = scheduler_config;
// update KV number in scheduler config
// update KV blocks number in scheduler config
if (scheduler_config.num_kv_blocks != device_config.get_num_kv_blocks()) {
updated_config.num_kv_blocks = device_config.get_num_kv_blocks();
}
Expand Down Expand Up @@ -166,24 +166,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
timer.start();
logits = m_model_runner->forward(m_requests, scheduler_output);
timer.end();

ov::InferRequest infer_request = m_model_runner->get_infer_request();
ov::CompiledModel compiled_model = infer_request.get_compiled_model();
const bool is_profiling_enabled = compiled_model.get_property(ov::enable_profiling);

// collect detailed statistic
if (is_profiling_enabled) {
std::vector<ov::ProfilingInfo> profiling_info = m_model_runner->get_infer_request().get_profiling_info();
for (const ov::ProfilingInfo& info : profiling_info) {
double current_time = info.real_time.count();
if (info.node_type == "PagedAttentionExtension") {
m_perf.m_paged_attention_time_ms += current_time;
} else if (info.node_type == "FullyConnected") {
m_perf.m_matmul_time_ms += current_time;
}
m_perf.m_infer_total_ms += current_time;
}
}
}

#ifdef DEBUG_CACHE_STATE_DUMP
Expand Down
Loading

0 comments on commit 8bef5a3

Please sign in to comment.