From b955ea6e778c51394c18e3917c52725554d71c04 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Thu, 12 Dec 2024 13:46:16 +0100 Subject: [PATCH 01/18] [GHA] Minimize memory consumption during tests (#1371) This is a workaround to minimize memory consumption during tests and allow the use of less powerful CI runners --------- Co-authored-by: Alexander Suvorov --- .github/workflows/linux.yml | 4 +--- tests/python_tests/test_whisper_generate_api.py | 9 +++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 0d1dc9f948..96848e947c 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -271,14 +271,12 @@ jobs: test: - name: 'Whisper' cmd: 'tests/python_tests/test_whisper_generate_api.py' - runner: aks-linux-8-cores-32gb - name: 'LLM & VLM' cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py -k "not Qwen2-0.5B-Instruct"' # Skip failed tests Qwen2-0.5B-Instruct - runner: aks-linux-4-cores-16gb defaults: run: shell: bash - runs-on: ${{ matrix.test.runner }} + runs-on: aks-linux-4-cores-16gb container: image: openvinogithubactions.azurecr.io/ov_test/ubuntu_22_04_x64:${{ needs.openvino_download.outputs.docker_tag }} volumes: diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py index bcbe2890bd..5a68dd98b6 100644 --- a/tests/python_tests/test_whisper_generate_api.py +++ b/tests/python_tests/test_whisper_generate_api.py @@ -10,11 +10,20 @@ import datasets from transformers import WhisperProcessor, pipeline, AutoTokenizer from optimum.intel.openvino import OVModelForSpeechSeq2Seq +import gc import json import time import typing import numpy as np +@pytest.fixture(scope="class", autouse=True) +def run_gc_after_test(): + """ + Fixture to run garbage collection after each test class. + This is a workaround to minimize memory consumption during tests and allow the use of less powerful CI runners. + """ + yield + gc.collect() @functools.lru_cache(1) def read_whisper_model(params, **tokenizer_kwargs): From d17f7168f278ef98acfdc7ba1ac93e4c759a6402 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 13 Dec 2024 08:03:11 +0400 Subject: [PATCH 02/18] [Image generation] Added num_steps to callback (#1372) With image to image and inpainting, an user passed `num_inference_steps` is scaled based on `strength` parameter. So, we need to report actual number of steps within `callback` CC @RyanMetcalfeInt8 --- samples/cpp/image_generation/README.md | 6 ++--- samples/python/image_generation/README.md | 6 ++--- .../image_generation/generation_config.hpp | 4 ++-- .../src/image_generation/flux_pipeline.hpp | 22 ++++++------------- .../stable_diffusion_3_pipeline.hpp | 21 ++++++++---------- .../stable_diffusion_pipeline.hpp | 13 +++++------ src/python/py_utils.cpp | 2 +- tools/llm_bench/llm_bench_utils/ov_utils.py | 2 +- 8 files changed, 31 insertions(+), 45 deletions(-) diff --git a/samples/cpp/image_generation/README.md b/samples/cpp/image_generation/README.md index 795bea8999..8a5cc5aa19 100644 --- a/samples/cpp/image_generation/README.md +++ b/samples/cpp/image_generation/README.md @@ -52,9 +52,9 @@ Please find the template of the callback usage below. 
```cpp ov::genai::Text2ImagePipeline pipe(models_path, device); -auto callback = [&](size_t step, ov::Tensor& intermediate_res) -> bool { - std::cout << "Image generation step: " << step << std::endl; - ov::Tensor img = pipe.decode(intermediate_res); // get intermediate image tensor +auto callback = [&](size_t step, size_t num_steps, ov::Tensor& latent) -> bool { + std::cout << "Image generation step: " << step << " / " << num_steps << std::endl; + ov::Tensor img = pipe.decode(latent); // get intermediate image tensor if (your_condition) // return true if you want to interrupt image generation return true; return false; diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md index 4abe45b2b4..321f3f6d05 100644 --- a/samples/python/image_generation/README.md +++ b/samples/python/image_generation/README.md @@ -52,9 +52,9 @@ Please find the template of the callback usage below. ```python pipe = openvino_genai.Text2ImagePipeline(model_dir, device) -def callback(step, intermediate_res): - print("Image generation step: ", step) - image_tensor = pipe.decode(intermediate_res) # get intermediate image tensor +def callback(step, num_steps, latent): + print(f"Image generation step: {step} / {num_steps}") + image_tensor = pipe.decode(latent) # get intermediate image tensor if your_condition: # return True if you want to interrupt image generation return True return False diff --git a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp index 0b749ecd83..50e576466d 100644 --- a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp +++ b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp @@ -216,11 +216,11 @@ static constexpr ov::Property max_sequence_length{"max_sequence_length"}; /** * User callback for image generation pipelines, which is called within a pipeline with the following arguments: - * - Total number of inference steps. Note, that in case of 'strength' parameter, the number of inference steps is reduced linearly * - Current inference step + * - Total number of inference steps. Note, that in case of 'strength' parameter, the number of inference steps is reduced linearly * - Tensor representing current latent. Such latent can be converted to human-readable representation via image generation pipeline 'decode()' method */ -static constexpr ov::Property> callback{"callback"}; +static constexpr ov::Property> callback{"callback"}; /** * Function to pass 'ImageGenerationConfig' as property to 'generate()' call. 
diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index 4cdac5bb1a..ac82bd0cab 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -326,9 +326,11 @@ class FluxPipeline : public DiffusionPipeline { m_custom_generation_config.strength = 1.0f; } - if (!initial_image) { - // in case of typical text to image generation, we need to ignore 'strength' - m_custom_generation_config.strength = 1.0f; + // Use callback if defined + std::function callback = nullptr; + auto callback_iter = properties.find(ov::genai::callback.name()); + if (callback_iter != properties.end()) { + callback = callback_iter->second.as>(); } const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); @@ -355,14 +357,6 @@ class FluxPipeline : public DiffusionPipeline { m_scheduler->set_timesteps_with_sigma(sigmas, mu); std::vector timesteps = m_scheduler->get_float_timesteps(); - // Use callback if defined - std::function callback; - auto callback_iter = properties.find(ov::genai::callback.name()); - bool do_callback = callback_iter != properties.end(); - if (do_callback) { - callback = callback_iter->second.as>(); - } - // 6. Denoising loop ov::Tensor timestep(ov::element::f32, {1}); float* timestep_data = timestep.data(); @@ -375,10 +369,8 @@ class FluxPipeline : public DiffusionPipeline { auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator); latents = scheduler_step_result["latent"]; - if (do_callback) { - if (callback(inference_step, latents)) { - return ov::Tensor(ov::element::u8, {}); - } + if (callback && callback(inference_step, timesteps.size(), latents)) { + return ov::Tensor(ov::element::u8, {}); } } diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp index 4e9a70ec2d..3cdaa409d1 100644 --- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp @@ -431,6 +431,13 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { generation_config.strength = 1.0f; } + // Use callback if defined + std::function callback = nullptr; + auto callback_iter = properties.find(ov::genai::callback.name()); + if (callback_iter != properties.end()) { + callback = callback_iter->second.as>(); + } + const auto& transformer_config = m_transformer->get_config(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) @@ -467,14 +474,6 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { // 6. 
Denoising loop ov::Tensor noisy_residual_tensor(ov::element::f32, {}); - // Use callback if defined - std::function callback; - auto callback_iter = properties.find(ov::genai::callback.name()); - bool do_callback = callback_iter != properties.end(); - if (do_callback) { - callback = callback_iter->second.as>(); - } - for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) { // concat the same latent twice along a batch dimension in case of CFG if (batch_size_multiplier > 1) { @@ -510,10 +509,8 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step, generation_config.generator); latent = scheduler_step_result["latent"]; - if (do_callback) { - if (callback(inference_step, latent)) { - return ov::Tensor(ov::element::u8, {}); - } + if (callback && callback(inference_step, timesteps.size(), latent)) { + return ov::Tensor(ov::element::u8, {}); } } diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp index 9dbdbac088..c53c9b7d25 100644 --- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp @@ -306,11 +306,10 @@ class StableDiffusionPipeline : public DiffusionPipeline { } // use callback if defined - std::function callback; + std::function callback = nullptr; auto callback_iter = properties.find(ov::genai::callback.name()); - bool do_callback = callback_iter != properties.end(); - if (do_callback) { - callback = callback_iter->second.as>(); + if (callback_iter != properties.end()) { + callback = callback_iter->second.as>(); } // Stable Diffusion pipeline @@ -400,10 +399,8 @@ class StableDiffusionPipeline : public DiffusionPipeline { const auto it = scheduler_step_result.find("denoised"); denoised = it != scheduler_step_result.end() ? 
it->second : latent; - if (do_callback) { - if (callback(inference_step, denoised)) { - return ov::Tensor(ov::element::u8, {}); - } + if (callback && callback(inference_step, timesteps.size(), denoised)) { + return ov::Tensor(ov::element::u8, {}); } } diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index 9d33318f0a..45a0c46174 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -280,7 +280,7 @@ ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) { } else if (py::isinstance(py_obj)) { return py::cast>(py_obj); } else if (py::isinstance(py_obj) && property_name == "callback") { - return py::cast>(py_obj); + return py::cast>(py_obj); } else if ((py::isinstance(py_obj) || py::isinstance(py_obj) || py::isinstance(py_obj)) && property_name == "streamer") { auto streamer = py::cast(py_obj); return ov::genai::streamer(pystreamer_to_streamer(streamer)).second; diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py index c5fa422824..8a28fbe355 100644 --- a/tools/llm_bench/llm_bench_utils/ov_utils.py +++ b/tools/llm_bench/llm_bench_utils/ov_utils.py @@ -366,7 +366,7 @@ def __init__(self) -> types.NoneType: self.start_time = time.perf_counter() self.duration = -1 - def __call__(self, step, latents): + def __call__(self, step, num_steps, latents): self.iteration_time.append(time.perf_counter() - self.start_time) self.start_time = time.perf_counter() return False From d189eb7541a61a41581dd21361db3aa3884d211b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 13 Dec 2024 15:46:38 +0400 Subject: [PATCH 03/18] GHA: use preconverted LCM model (#1380) --- .github/workflows/lcm_dreamshaper_cpp.yml | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index b3a36761e1..258184e9e4 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -62,35 +62,35 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt - - name: Download and convert models and tokenizer + - name: Download models run: | source openvino_lcm_cpp/bin/activate - optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --task stable-diffusion --weight-format fp16 models/lcm_dreamshaper_v7/FP16 + huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir models/lcm_dreamshaper_v7 wget -O ./image.png https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png wget -O ./mask_image.png https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png - name: Run heterogeneous_stable_diffusion run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - ${{ env.build_dir }}/samples/cpp/image_generation/heterogeneous_stable_diffusion ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + ${{ env.build_dir }}/samples/cpp/image_generation/heterogeneous_stable_diffusion ./models/lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" - name: Run heterogeneous_stable_diffusion.py run: | source openvino_lcm_cpp/bin/activate source ./ov/setupvars.sh 
- python ./samples/python/image_generation/heterogeneous_stable_diffusion.py ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + python ./samples/python/image_generation/heterogeneous_stable_diffusion.py ./models/lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" env: PYTHONPATH: ${{ env.build_dir }} - name: Run image2image run: | source ./ov/setupvars.sh - ${{ env.build_dir }}/samples/cpp/image_generation/image2image ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" ./image.png + ${{ env.build_dir }}/samples/cpp/image_generation/image2image ./models/lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" ./image.png - name: Run inpainting run: | source ./ov/setupvars.sh - ${{ env.build_dir }}/samples/cpp/image_generation/inpainting ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" ./image.png ./mask_image.png + ${{ env.build_dir }}/samples/cpp/image_generation/inpainting ./models/lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" ./image.png ./mask_image.png lcm_dreamshaper_v7_cpp-windows: runs-on: windows-2019 @@ -134,24 +134,24 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt - - name: Download and convert models and tokenizer + - name: Download models run: | . "./openvino_lcm_cpp/Scripts/Activate.ps1" - optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --task stable-diffusion --weight-format fp16 models/lcm_dreamshaper_v7/FP16 + huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir models/lcm_dreamshaper_v7 Invoke-WebRequest -Uri 'https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png' -OutFile 'image.png' Invoke-WebRequest -Uri 'https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png' -OutFile 'mask_image.png' - name: Run heterogeneous_stable_diffusion run: > . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - & "${{ env.build_dir }}/samples/cpp/image_generation/Release/heterogeneous_stable_diffusion.exe ./models/lcm_dreamshaper_v7/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" + & "${{ env.build_dir }}/samples/cpp/image_generation/Release/heterogeneous_stable_diffusion.exe ./models/lcm_dreamshaper_v7 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" - name: Run heterogeneous_stable_diffusion.py run: | . "./openvino_lcm_cpp/Scripts/Activate.ps1" . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" $env:Path += "${{ env.build_dir }}\openvino_genai" - python .\samples\python\image_generation\heterogeneous_stable_diffusion.py .\models\lcm_dreamshaper_v7\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + python .\samples\python\image_generation\heterogeneous_stable_diffusion.py .\models\lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" env: PYTHONPATH: ${{ env.build_dir }} @@ -160,7 +160,7 @@ jobs: . "./openvino_lcm_cpp/Scripts/Activate.ps1" . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" $env:Path += "${{ env.build_dir }}\openvino_genai" - python .\samples\python\image_generation\image2image.py .\models\lcm_dreamshaper_v7\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" .\image.png + python .\samples\python\image_generation\image2image.py .\models\lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" .\image.png env: PYTHONPATH: ${{ env.build_dir }} @@ -169,7 +169,7 @@ jobs: . "./openvino_lcm_cpp/Scripts/Activate.ps1" . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" $env:Path += "${{ env.build_dir }}\openvino_genai" - python .\samples\python\image_generation\inpainting.py .\models\lcm_dreamshaper_v7\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" .\image.png .\mask_image.png + python .\samples\python\image_generation\inpainting.py .\models\lcm_dreamshaper_v7 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" .\image.png .\mask_image.png env: PYTHONPATH: ${{ env.build_dir }} From 1b7a9e58433c8e78467db55549df8a1ab65b11b5 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Fri, 13 Dec 2024 19:26:27 +0100 Subject: [PATCH 04/18] [GHA] Use the latest stable ov commit (#1385) Temporary freeze OV commit until regression caused by https://github.com/openvinotoolkit/openvino/commit/f1cba31319c3a2b150a801ea969bfe463041d5fc is fixed --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 96848e947c..18107aa203 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -53,7 +53,7 @@ jobs: with: platform: ubuntu22 commit_packages_to_provide: wheels - revision: latest_available_commit + revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868 - name: Clone docker tag from OpenVINO repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 From 1e9e2c09a7698fb84fea6984e7588ea8a718c842 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 14 Dec 2024 00:47:09 +0400 Subject: [PATCH 05/18] [Image generation] Added conversion between pipelines (#1375) Created constructors for image generation pipelines to share models between pipelines, but generation configs are initialized as default (because, text2image and image2image have different default values for some parameters and cannot be shared as is) --- .../image_generation/image2image_pipeline.hpp | 25 +++------ .../image_generation/inpainting_pipeline.hpp | 11 ++++ .../image_generation/text2image_pipeline.hpp | 47 ++++++++-------- .../src/image_generation/flux_pipeline.hpp | 56 +++++++++++++------ .../image_generation/image2image_pipeline.cpp | 10 ++++ .../image_generation/inpainting_pipeline.cpp | 17 +++++- 
.../image_generation/schedulers/scheduler.cpp | 1 - .../stable_diffusion_3_pipeline.hpp | 54 ++++++++++++++---- .../stable_diffusion_pipeline.hpp | 39 ++++++++----- .../stable_diffusion_xl_pipeline.hpp | 18 +++++- .../image_generation/text2image_pipeline.cpp | 28 ++++++++++ .../openvino_genai/py_openvino_genai.pyi | 12 ++++ src/python/py_image_generation_pipelines.cpp | 40 ++++++++----- 13 files changed, 256 insertions(+), 102 deletions(-) diff --git a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp index a859b29c2e..ea02969c5e 100644 --- a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp @@ -3,28 +3,11 @@ #pragma once -#include -#include -#include -#include - -#include "openvino/core/any.hpp" -#include "openvino/runtime/tensor.hpp" - -#include "openvino/genai/image_generation/scheduler.hpp" -#include "openvino/genai/image_generation/generation_config.hpp" - -#include "openvino/genai/image_generation/clip_text_model.hpp" -#include "openvino/genai/image_generation/clip_text_model_with_projection.hpp" -#include "openvino/genai/image_generation/unet2d_condition_model.hpp" -#include "openvino/genai/image_generation/autoencoder_kl.hpp" +#include "openvino/genai/image_generation/inpainting_pipeline.hpp" namespace ov { namespace genai { -// forward declaration -class DiffusionPipeline; - // // Image to image pipeline // @@ -42,6 +25,8 @@ class OPENVINO_GENAI_EXPORTS Image2ImagePipeline { Properties&&... properties) : Image2ImagePipeline(models_path, device, ov::AnyMap{std::forward(properties)...}) { } + Image2ImagePipeline(const InpaintingPipeline& pipe); + // creates either LCM or SD pipeline from building blocks static Image2ImagePipeline stable_diffusion( const std::shared_ptr& scheduler_type, @@ -99,6 +84,10 @@ class OPENVINO_GENAI_EXPORTS Image2ImagePipeline { std::shared_ptr m_impl; explicit Image2ImagePipeline(const std::shared_ptr& impl); + + // to create other pipelines from image to image + friend class Text2ImagePipeline; + friend class InpaintingPipeline; }; } // namespace genai diff --git a/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp index c970fa0e23..6eead673e4 100644 --- a/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp @@ -18,12 +18,17 @@ #include "openvino/genai/image_generation/clip_text_model_with_projection.hpp" #include "openvino/genai/image_generation/unet2d_condition_model.hpp" #include "openvino/genai/image_generation/autoencoder_kl.hpp" +#include "openvino/genai/image_generation/t5_encoder_model.hpp" +#include "openvino/genai/image_generation/sd3_transformer_2d_model.hpp" +#include "openvino/genai/image_generation/flux_transformer_2d_model.hpp" namespace ov { namespace genai { // forward declaration class DiffusionPipeline; +class Text2ImagePipeline; +class Image2ImagePipeline; // // Inpainting pipeline @@ -42,6 +47,8 @@ class OPENVINO_GENAI_EXPORTS InpaintingPipeline { Properties&&... 
properties) : InpaintingPipeline(models_path, device, ov::AnyMap{std::forward(properties)...}) { } + InpaintingPipeline(const Image2ImagePipeline& pipe); + // creates either LCM or SD pipeline from building blocks static InpaintingPipeline stable_diffusion( const std::shared_ptr& scheduler_type, @@ -100,6 +107,10 @@ class OPENVINO_GENAI_EXPORTS InpaintingPipeline { std::shared_ptr m_impl; explicit InpaintingPipeline(const std::shared_ptr& impl); + + // to create other pipelines from inpainting + friend class Text2ImagePipeline; + friend class Image2ImagePipeline; }; } // namespace genai diff --git a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp index b66ced748b..34b9d6e341 100644 --- a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp @@ -3,31 +3,11 @@ #pragma once -#include -#include -#include -#include - -#include "openvino/core/any.hpp" -#include "openvino/runtime/tensor.hpp" - -#include "openvino/genai/image_generation/scheduler.hpp" -#include "openvino/genai/image_generation/generation_config.hpp" - -#include "openvino/genai/image_generation/clip_text_model.hpp" -#include "openvino/genai/image_generation/clip_text_model_with_projection.hpp" -#include "openvino/genai/image_generation/unet2d_condition_model.hpp" -#include "openvino/genai/image_generation/sd3_transformer_2d_model.hpp" -#include "openvino/genai/image_generation/autoencoder_kl.hpp" -#include "openvino/genai/image_generation/t5_encoder_model.hpp" -#include "openvino/genai/image_generation/flux_transformer_2d_model.hpp" +#include "openvino/genai/image_generation/image2image_pipeline.hpp" namespace ov { namespace genai { -// forward declaration -class DiffusionPipeline; - /** * Text to image pipelines which provides unified API to all supported models types. * Models specific aspects are hidden in image generation config, which includes multiple prompts support or @@ -63,6 +43,20 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { Properties&&... properties) : Text2ImagePipeline(models_path, device, ov::AnyMap{std::forward(properties)...}) { } + /** + * Creates text to image pipeline based on image to image pipeline and shares models + * @param pipe Image to image pipeline to share models with + * @note Generation config is not shared with image to image pipeline and default one is created + */ + Text2ImagePipeline(const Image2ImagePipeline& pipe); + + /** + * Creates text to image pipeline based on inpainting pipeline and shares models + * @param pipe Inpainting pipeline to share models with + * @note Generation config is not shared with image to image pipeline and default one is created + */ + Text2ImagePipeline(const InpaintingPipeline& pipe); + /** * Creates Stable Diffusion pipeline from individual models * @param scheduler A scheduler used to denoise final image @@ -178,6 +172,15 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { * @param guidance_scale A guidance scale. Note, that it's important whether guidance_scale > 1, which affects whether negative prompts * are used or not. For example, all values > 1 are the same for reshape perspective and may vary in subsequent 'generate()' calls. * @note If pipeline has been already compiled, it cannot be reshaped and an exception is thrown. 
+ * + * Example how to reshape SD3 or Flux models for specific max sequence length: + * @code + * ov::genai::Text2ImagePipeline pipe("/path"); + * ov::genai::ImageGenerationConfig default_config = pipe.get_generation_config(); + * default_config.max_sequence_length = 30; + * pipe.set_generation_config(default_config); + * pipe.reshape(1, 512, 512, default_config.guidance_scale); // reshape will bypass `max_sequence_length` to T5 encoder model + * @endcode */ void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale); @@ -200,7 +203,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { * Generates image(s) based on prompt and other image generarion parameters * @param positive_prompt Prompt to generate image(s) from * @param properties Image generation parameters specified as properties. Values in 'properties' override default value for generation parameters. - * @return A tensor which has dimensions [num_images_per_prompt, height, width, 3] + * @returns A tensor which has dimensions [num_images_per_prompt, height, width, 3] */ ov::Tensor generate(const std::string& positive_prompt, const ov::AnyMap& properties = {}); diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index ac82bd0cab..716ba6b61b 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -225,6 +225,15 @@ class FluxPipeline : public DiffusionPipeline { initialize_generation_config("FluxPipeline"); } + FluxPipeline(PipelineType pipeline_type, const FluxPipeline& pipe) : + FluxPipeline(pipe) { + OPENVINO_ASSERT(!pipe.is_inpainting_model(), "Cannot create ", + pipeline_type == PipelineType::TEXT_2_IMAGE ? "'Text2ImagePipeline'" : "'Image2ImagePipeline'", " from InpaintingPipeline with inpainting model"); + + m_pipeline_type = pipeline_type; + initialize_generation_config("FluxPipeline"); + } + void reshape(const int num_images_per_prompt, const int height, const int width, @@ -232,13 +241,6 @@ class FluxPipeline : public DiffusionPipeline { check_image_size(height, width); m_clip_text_encoder->reshape(1); - - // TODO: max_sequence_length cannot be specified easily outside, only via: - // Text2ImagePipeline pipe("/path"); - // ImageGenerationConfig default_config = pipe.get_generation_config(); - // default_config.max_sequence_length = 30; - // pipe.set_generation_config(default_config); - // pipe.reshape(1, 512, 512, default_config.guidance_scale); m_t5_text_encoder->reshape(1, m_generation_config.max_sequence_length); m_transformer->reshape(num_images_per_prompt, height, width, m_generation_config.max_sequence_length); @@ -321,11 +323,6 @@ class FluxPipeline : public DiffusionPipeline { m_custom_generation_config = m_generation_config; m_custom_generation_config.update_generation_config(properties); - if (!initial_image) { - // in case of typical text to image generation, we need to ignore 'strength' - m_custom_generation_config.strength = 1.0f; - } - // Use callback if defined std::function callback = nullptr; auto callback_iter = properties.find(ov::genai::callback.name()); @@ -337,9 +334,9 @@ class FluxPipeline : public DiffusionPipeline { const auto& transformer_config = m_transformer->get_config(); if (m_custom_generation_config.height < 0) - m_custom_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor; + compute_dim(m_custom_generation_config.height, initial_image, 1 /* assume NHWC */); if (m_custom_generation_config.width 
< 0) - m_custom_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor; + compute_dim(m_custom_generation_config.width, initial_image, 2 /* assume NHWC */); check_inputs(m_custom_generation_config, initial_image); @@ -387,6 +384,29 @@ class FluxPipeline : public DiffusionPipeline { } private: + bool is_inpainting_model() const { + assert(m_transformer != nullptr); + assert(m_vae != nullptr); + return m_transformer->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1); + } + + void compute_dim(int64_t & generation_config_value, ov::Tensor initial_image, int dim_idx) { + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + const auto& transformer_config = m_transformer->get_config(); + + // in case of image to image generation_config_value is just ignored and computed based on initial image + if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { + OPENVINO_ASSERT(initial_image, "Initial image is empty for image to image pipeline"); + ov::Shape shape = initial_image.get_shape(); + int64_t dim_val = shape[dim_idx]; + + generation_config_value = dim_val - (dim_val % vae_scale_factor); + } + + if (generation_config_value < 0) + generation_config_value = transformer_config.m_default_sample_size * vae_scale_factor; + } + void initialize_generation_config(const std::string& class_name) override { assert(m_transformer != nullptr); assert(m_vae != nullptr); @@ -394,8 +414,12 @@ class FluxPipeline : public DiffusionPipeline { const auto& transformer_config = m_transformer->get_config(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor; - m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor; + m_generation_config = ImageGenerationConfig(); + + if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) { + m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor; + m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor; + } if (class_name == "FluxPipeline" || class_name == "FluxImg2ImgPipeline" || class_name == "FluxInpaintPipeline" ) { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) { diff --git a/src/cpp/src/image_generation/image2image_pipeline.cpp b/src/cpp/src/image_generation/image2image_pipeline.cpp index 527b532b71..38ff5a0a4c 100644 --- a/src/cpp/src/image_generation/image2image_pipeline.cpp +++ b/src/cpp/src/image_generation/image2image_pipeline.cpp @@ -39,6 +39,16 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir, } } +Image2ImagePipeline::Image2ImagePipeline(const InpaintingPipeline& pipe) { + if (auto stable_diffusion_xl = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion_xl != nullptr) { + m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, *stable_diffusion_xl); + } else if (auto stable_diffusion = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion != nullptr) { + m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, *stable_diffusion); + } else { + OPENVINO_ASSERT("Cannot convert specified InpaintingPipeline to Image2ImagePipeline"); + } +} + Image2ImagePipeline::Image2ImagePipeline(const std::shared_ptr& impl) : m_impl(impl) { assert(m_impl != nullptr); diff --git a/src/cpp/src/image_generation/inpainting_pipeline.cpp b/src/cpp/src/image_generation/inpainting_pipeline.cpp index d3612c4964..a510be0a57 100644 --- a/src/cpp/src/image_generation/inpainting_pipeline.cpp +++ 
b/src/cpp/src/image_generation/inpainting_pipeline.cpp @@ -6,6 +6,7 @@ #include #include "openvino/genai/image_generation/inpainting_pipeline.hpp" +#include "openvino/genai/image_generation/image2image_pipeline.hpp" #include "image_generation/stable_diffusion_pipeline.hpp" #include "image_generation/stable_diffusion_xl_pipeline.hpp" @@ -43,6 +44,16 @@ InpaintingPipeline::InpaintingPipeline(const std::filesystem::path& root_dir, co } } +InpaintingPipeline::InpaintingPipeline(const Image2ImagePipeline& pipe) { + if (auto stable_diffusion_xl = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion_xl != nullptr) { + m_impl = std::make_shared(PipelineType::INPAINTING, *stable_diffusion_xl); + } else if (auto stable_diffusion = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion != nullptr) { + m_impl = std::make_shared(PipelineType::INPAINTING, *stable_diffusion); + } else { + OPENVINO_ASSERT("Cannot convert specified Image2ImagePipeline to InpaintingPipeline"); + } +} + InpaintingPipeline::InpaintingPipeline(const std::shared_ptr& impl) : m_impl(impl) { assert(m_impl != nullptr); @@ -53,7 +64,7 @@ InpaintingPipeline InpaintingPipeline::stable_diffusion( const CLIPTextModel& clip_text_model, const UNet2DConditionModel& unet, const AutoencoderKL& vae) { - auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, unet, vae); + auto impl = std::make_shared(PipelineType::INPAINTING, clip_text_model, unet, vae); assert(scheduler != nullptr); impl->set_scheduler(scheduler); @@ -66,7 +77,7 @@ InpaintingPipeline InpaintingPipeline::latent_consistency_model( const CLIPTextModel& clip_text_model, const UNet2DConditionModel& unet, const AutoencoderKL& vae) { - auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, unet, vae); + auto impl = std::make_shared(PipelineType::INPAINTING, clip_text_model, unet, vae); assert(scheduler != nullptr); impl->set_scheduler(scheduler); @@ -80,7 +91,7 @@ InpaintingPipeline InpaintingPipeline::stable_diffusion_xl( const CLIPTextModelWithProjection& clip_text_model_with_projection, const UNet2DConditionModel& unet, const AutoencoderKL& vae) { - auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, clip_text_model_with_projection, unet, vae); + auto impl = std::make_shared(PipelineType::INPAINTING, clip_text_model, clip_text_model_with_projection, unet, vae); assert(scheduler != nullptr); impl->set_scheduler(scheduler); diff --git a/src/cpp/src/image_generation/schedulers/scheduler.cpp b/src/cpp/src/image_generation/schedulers/scheduler.cpp index 6ec31bbf6c..3a7556b6d9 100644 --- a/src/cpp/src/image_generation/schedulers/scheduler.cpp +++ b/src/cpp/src/image_generation/schedulers/scheduler.cpp @@ -29,7 +29,6 @@ std::shared_ptr Scheduler::from_config(const std::filesystem::path& s std::shared_ptr scheduler = nullptr; if (scheduler_type == Scheduler::Type::LCM) { - // TODO: do we need to pass RNG generator somehow to LCM? 
scheduler = std::make_shared(scheduler_config_path); } else if (scheduler_type == Scheduler::Type::LMS_DISCRETE) { scheduler = std::make_shared(scheduler_config_path); diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp index 3cdaa409d1..18a3e0346f 100644 --- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp @@ -218,6 +218,15 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { initialize_generation_config("StableDiffusion3Pipeline"); } + StableDiffusion3Pipeline(PipelineType pipeline_type, const StableDiffusion3Pipeline& pipe) : + StableDiffusion3Pipeline(pipe) { + OPENVINO_ASSERT(!pipe.is_inpainting_model(), "Cannot create ", + pipeline_type == PipelineType::TEXT_2_IMAGE ? "'Text2ImagePipeline'" : "'Image2ImagePipeline'", " from InpaintingPipeline with inpainting model"); + + m_pipeline_type = pipeline_type; + initialize_generation_config("StableDiffusion3Pipeline"); + } + void reshape(const int num_images_per_prompt, const int height, const int width, @@ -426,11 +435,6 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { ImageGenerationConfig generation_config = m_generation_config; generation_config.update_generation_config(properties); - if (!initial_image) { - // in case of typical text to image generation, we need to ignore 'strength' - generation_config.strength = 1.0f; - } - // Use callback if defined std::function callback = nullptr; auto callback_iter = properties.find(ov::genai::callback.name()); @@ -440,14 +444,12 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { const auto& transformer_config = m_transformer->get_config(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) - ? 2 - : 1; // Transformer accepts 2x batch in case of CFG + const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 
2 : 1; // Transformer accepts 2x batch in case of CFG if (generation_config.height < 0) - generation_config.height = transformer_config.sample_size * vae_scale_factor; + compute_dim(generation_config.height, initial_image, 1 /* assume NHWC */); if (generation_config.width < 0) - generation_config.width = transformer_config.sample_size * vae_scale_factor; + compute_dim(generation_config.width, initial_image, 2 /* assume NHWC */); check_inputs(generation_config, initial_image); @@ -522,6 +524,29 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { } private: + bool is_inpainting_model() const { + assert(m_transformer != nullptr); + assert(m_vae != nullptr); + return m_transformer->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1); + } + + void compute_dim(int64_t & generation_config_value, ov::Tensor initial_image, int dim_idx) { + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + const auto& transformer_config = m_transformer->get_config(); + + // in case of image to image generation_config_value is just ignored and computed based on initial image + if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { + OPENVINO_ASSERT(initial_image, "Initial image is empty for image to image pipeline"); + ov::Shape shape = initial_image.get_shape(); + int64_t dim_val = shape[dim_idx]; + + generation_config_value = dim_val - (dim_val % vae_scale_factor); + } + + if (generation_config_value < 0) + generation_config_value = transformer_config.sample_size * vae_scale_factor; + } + bool do_classifier_free_guidance(float guidance_scale) const { return guidance_scale > 1.0; } @@ -533,8 +558,13 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { const auto& transformer_config = m_transformer->get_config(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - m_generation_config.height = transformer_config.sample_size * vae_scale_factor; - m_generation_config.width = transformer_config.sample_size * vae_scale_factor; + m_generation_config = ImageGenerationConfig(); + + // in case of image to image, the shape is computed based on initial image + if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) { + m_generation_config.height = transformer_config.sample_size * vae_scale_factor; + m_generation_config.width = transformer_config.sample_size * vae_scale_factor; + } if (class_name == "StableDiffusion3Pipeline" || class_name == "StableDiffusion3Img2ImgPipeline" || class_name == "StableDiffusion3InpaintPipeline") { m_generation_config.guidance_scale = 7.0f; diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp index c53c9b7d25..4afbd3ac78 100644 --- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp @@ -147,6 +147,18 @@ class StableDiffusionPipeline : public DiffusionPipeline { initialize_generation_config(pipeline_name); } + StableDiffusionPipeline(PipelineType pipeline_type, const StableDiffusionPipeline& pipe) : + StableDiffusionPipeline(pipe) { + OPENVINO_ASSERT(!pipe.is_inpainting_model(), "Cannot create ", + pipeline_type == PipelineType::TEXT_2_IMAGE ? "'Text2ImagePipeline'" : "'Image2ImagePipeline'", " from InpaintingPipeline with inpainting model"); + + m_pipeline_type = pipeline_type; + + const bool is_lcm = m_unet->get_config().time_cond_proj_dim > 0; + const char * const pipeline_name = is_lcm ? 
"LatentConsistencyModelPipeline" : "StableDiffusionPipeline"; + initialize_generation_config(pipeline_name); + } + void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) override { check_image_size(height, width); @@ -206,8 +218,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); const bool is_inpainting = m_pipeline_type == PipelineType::INPAINTING, is_strength_max = is_inpainting && generation_config.strength == 1.0f, - is_inpainting_model = is_inpainting && m_unet->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1), - return_image_latent = is_inpainting && !is_inpainting_model; + return_image_latent = is_inpainting && !is_inpainting_model(); ov::Shape latent_shape{generation_config.num_images_per_prompt, m_vae->get_config().latent_channels, generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; @@ -254,7 +265,6 @@ class StableDiffusionPipeline : public DiffusionPipeline { const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Unet accepts 2x batch in case of CFG const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - const bool is_inpainting_model = m_unet->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1); ov::Shape target_shape = processed_image.get_shape(); ov::Tensor mask_condition = m_image_resizer->execute(mask_image, target_shape[2], target_shape[3]); @@ -266,7 +276,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { ov::Tensor masked_image_latent; - if (is_inpainting_model) { + if (is_inpainting_model()) { // create masked image ov::Tensor masked_image(ov::element::f32, processed_image.get_shape()); const float * mask_condition_data = mask_condition.data(); @@ -300,11 +310,6 @@ class StableDiffusionPipeline : public DiffusionPipeline { ImageGenerationConfig generation_config = m_generation_config; generation_config.update_generation_config(properties); - if (!initial_image) { - // in case of typical text to image generation, we need to ignore 'strength' - generation_config.strength = 1.0f; - } - // use callback if defined std::function callback = nullptr; auto callback_iter = properties.find(ov::genai::callback.name()); @@ -318,12 +323,12 @@ class StableDiffusionPipeline : public DiffusionPipeline { const auto& unet_config = m_unet->get_config(); const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Unet accepts 2x batch in case of CFG const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - const bool is_inpainting_model = unet_config.in_channels == (m_vae->get_config().latent_channels * 2 + 1); if (generation_config.height < 0) compute_dim(generation_config.height, initial_image, 1 /* assume NHWC */); if (generation_config.width < 0) compute_dim(generation_config.width, initial_image, 2 /* assume NHWC */); + check_inputs(generation_config, initial_image); set_lora_adapters(generation_config.adapters); @@ -364,7 +369,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { m_scheduler->scale_model_input(latent_cfg, inference_step); - ov::Tensor latent_model_input = is_inpainting_model ? numpy_utils::concat(numpy_utils::concat(latent_cfg, mask, 1), masked_image_latent, 1) : latent_cfg; + ov::Tensor latent_model_input = is_inpainting_model() ? 
numpy_utils::concat(numpy_utils::concat(latent_cfg, mask, 1), masked_image_latent, 1) : latent_cfg; ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); ov::Tensor noise_pred_tensor = m_unet->infer(latent_model_input, timestep); @@ -391,7 +396,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { latent = scheduler_step_result["latent"]; // in case of non-specialized inpainting model, we need manually mask current denoised latent and initial image latent - if (m_pipeline_type == PipelineType::INPAINTING && !is_inpainting_model) { + if (m_pipeline_type == PipelineType::INPAINTING && !is_inpainting_model()) { blend_latents(image_latent, noise, mask, latent, inference_step); } @@ -412,6 +417,12 @@ class StableDiffusionPipeline : public DiffusionPipeline { } protected: + bool is_inpainting_model() const { + assert(m_unet != nullptr); + assert(m_vae != nullptr); + return m_unet->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1); + } + void compute_dim(int64_t & generation_config_value, ov::Tensor initial_image, int dim_idx) { const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); const auto& unet_config = m_unet->get_config(); @@ -435,13 +446,15 @@ class StableDiffusionPipeline : public DiffusionPipeline { const auto& unet_config = m_unet->get_config(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + m_generation_config = ImageGenerationConfig(); + // in case of image to image, the shape is computed based on initial image if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) { m_generation_config.height = unet_config.sample_size * vae_scale_factor; m_generation_config.width = unet_config.sample_size * vae_scale_factor; } - if (class_name == "StableDiffusionPipeline" || class_name == "StableDiffusionInpaintPipeline" || class_name == "StableDiffusionInpaintPipeline") { + if (class_name == "StableDiffusionPipeline" || class_name == "StableDiffusionImg2ImgPipeline" || class_name == "StableDiffusionInpaintPipeline") { m_generation_config.guidance_scale = 7.5f; m_generation_config.num_inference_steps = 50; m_generation_config.strength = m_pipeline_type == PipelineType::IMAGE_2_IMAGE ? 0.8f : 1.0f; diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp index 6913d901df..15f15219c2 100644 --- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp @@ -116,6 +116,15 @@ class StableDiffusionXLPipeline : public StableDiffusionPipeline { m_force_zeros_for_empty_prompt = true; } + StableDiffusionXLPipeline(PipelineType pipeline_type, const StableDiffusionXLPipeline& pipe) : + StableDiffusionXLPipeline(pipe) { + OPENVINO_ASSERT(!pipe.is_inpainting_model(), "Cannot create ", + pipeline_type == PipelineType::TEXT_2_IMAGE ? 
"'Text2ImagePipeline'" : "'Image2ImagePipeline'", " from InpaintingPipeline with inpainting model"); + + m_pipeline_type = pipeline_type; + initialize_generation_config("StableDiffusionXLPipeline"); + } + void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) override { check_image_size(height, width); @@ -291,8 +300,13 @@ class StableDiffusionXLPipeline : public StableDiffusionPipeline { const auto& unet_config = m_unet->get_config(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - m_generation_config.height = unet_config.sample_size * vae_scale_factor; - m_generation_config.width = unet_config.sample_size * vae_scale_factor; + m_generation_config = ImageGenerationConfig(); + + // in case of image to image, the shape is computed based on initial image + if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) { + m_generation_config.height = unet_config.sample_size * vae_scale_factor; + m_generation_config.width = unet_config.sample_size * vae_scale_factor; + } if (class_name == "StableDiffusionXLPipeline" || class_name == "StableDiffusionXLImg2ImgPipeline" || class_name == "StableDiffusionXLInpaintPipeline") { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) { diff --git a/src/cpp/src/image_generation/text2image_pipeline.cpp b/src/cpp/src/image_generation/text2image_pipeline.cpp index 6ceb076f85..56b02a2e10 100644 --- a/src/cpp/src/image_generation/text2image_pipeline.cpp +++ b/src/cpp/src/image_generation/text2image_pipeline.cpp @@ -51,6 +51,34 @@ Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir, co } } +Text2ImagePipeline::Text2ImagePipeline(const Image2ImagePipeline& pipe) { + if (auto stable_diffusion_xl = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion_xl != nullptr) { + m_impl = std::make_shared(PipelineType::TEXT_2_IMAGE, *stable_diffusion_xl); + } else if (auto stable_diffusion = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion != nullptr) { + m_impl = std::make_shared(PipelineType::TEXT_2_IMAGE, *stable_diffusion); + } else if (auto stable_diffusion_3 = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion_3 != nullptr) { + m_impl = std::make_shared(PipelineType::TEXT_2_IMAGE, *stable_diffusion_3); + } else if (auto flux = std::dynamic_pointer_cast(pipe.m_impl); flux != nullptr) { + m_impl = std::make_shared(PipelineType::TEXT_2_IMAGE, *flux); + } else { + OPENVINO_ASSERT("Cannot convert specified Image2ImagePipeline to Text2ImagePipeline"); + } +} + +Text2ImagePipeline::Text2ImagePipeline(const InpaintingPipeline& pipe) { + if (auto stable_diffusion_xl = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion_xl != nullptr) { + m_impl = std::make_shared(PipelineType::TEXT_2_IMAGE, *stable_diffusion_xl); + } else if (auto stable_diffusion = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion != nullptr) { + m_impl = std::make_shared(PipelineType::TEXT_2_IMAGE, *stable_diffusion); + } else if (auto stable_diffusion_3 = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion_3 != nullptr) { + m_impl = std::make_shared(PipelineType::TEXT_2_IMAGE, *stable_diffusion_3); + } else if (auto flux = std::dynamic_pointer_cast(pipe.m_impl); flux != nullptr) { + m_impl = std::make_shared(PipelineType::TEXT_2_IMAGE, *flux); + } else { + OPENVINO_ASSERT("Cannot convert specified InpaintingPipeline to Text2ImagePipeline"); + } +} + Text2ImagePipeline::Text2ImagePipeline(const std::shared_ptr& impl) : m_impl(impl) { assert(m_impl != nullptr); diff --git 
a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 8ab0407ea7..829d4844e8 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -772,6 +772,9 @@ class Image2ImagePipeline: device (str): Device to run the model on (e.g., CPU, GPU). kwargs: Image2ImagePipeline properties """ + @typing.overload + def __init__(self, pipe: InpaintingPipeline) -> None: + ... def compile(self, device: str, **kwargs) -> None: """ Compiles the model. @@ -868,6 +871,9 @@ class InpaintingPipeline: device (str): Device to run the model on (e.g., CPU, GPU). kwargs: InpaintingPipeline properties """ + @typing.overload + def __init__(self, pipe: Image2ImagePipeline) -> None: + ... def compile(self, device: str, **kwargs) -> None: """ Compiles the model. @@ -1535,6 +1541,12 @@ class Text2ImagePipeline: device (str): Device to run the model on (e.g., CPU, GPU). kwargs: Text2ImagePipeline properties """ + @typing.overload + def __init__(self, pipe: Image2ImagePipeline) -> None: + ... + @typing.overload + def __init__(self, pipe: InpaintingPipeline) -> None: + ... def compile(self, device: str, **kwargs) -> None: """ Compiles the model. diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index 7739b88ff9..55be1708c1 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -85,9 +85,7 @@ void init_image_generation_pipelines(py::module_& m) { .def(py::init<>()); py::class_>(m, "CppStdGenerator", "This class wraps std::mt19937 pseudo-random generator.") - .def(py::init([]( - uint32_t seed - ) { + .def(py::init([](uint32_t seed) { return std::make_unique(seed); }), py::arg("seed")) @@ -140,9 +138,7 @@ void init_image_generation_pipelines(py::module_& m) { }); auto text2image_pipeline = py::class_(m, "Text2ImagePipeline", "This class is used for generation with text-to-image models.") - .def(py::init([]( - const std::filesystem::path& models_path - ) { + .def(py::init([](const std::filesystem::path& models_path) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); return std::make_unique(models_path); }), @@ -151,7 +147,6 @@ void init_image_generation_pipelines(py::module_& m) { Text2ImagePipeline class constructor. models_path (os.PathLike): Path to the folder with exported model files. )") - .def(py::init([]( const std::filesystem::path& models_path, const std::string& device, @@ -211,9 +206,7 @@ void init_image_generation_pipelines(py::module_& m) { auto image2image_pipeline = py::class_(m, "Image2ImagePipeline", "This class is used for generation with image-to-image models.") - .def(py::init([]( - const std::filesystem::path& models_path - ) { + .def(py::init([](const std::filesystem::path& models_path) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); return std::make_unique(models_path); }), @@ -222,7 +215,6 @@ void init_image_generation_pipelines(py::module_& m) { Image2ImagePipeline class constructor. models_path (os.PathLike): Path to the folder with exported model files. 
)") - .def(py::init([]( const std::filesystem::path& models_path, const std::string& device, @@ -277,9 +269,7 @@ void init_image_generation_pipelines(py::module_& m) { auto inpainting_pipeline = py::class_(m, "InpaintingPipeline", "This class is used for generation with inpainting models.") - .def(py::init([]( - const std::filesystem::path& models_path - ) { + .def(py::init([](const std::filesystem::path& models_path) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); return std::make_unique(models_path); }), @@ -288,7 +278,6 @@ void init_image_generation_pipelines(py::module_& m) { InpaintingPipeline class constructor. models_path (os.PathLike): Path to the folder with exported model files. )") - .def(py::init([]( const std::filesystem::path& models_path, const std::string& device, @@ -342,4 +331,25 @@ void init_image_generation_pipelines(py::module_& m) { py::arg("mask_image"), "Mask image", (text2image_generate_docstring + std::string(" \n ")).c_str()) .def("decode", &ov::genai::InpaintingPipeline::decode, py::arg("latent")); + + // define constructors to create one pipeline from another + // NOTE: needs to be defined once all pipelines are created + + text2image_pipeline + .def(py::init([](const ov::genai::Image2ImagePipeline& pipe) { + return std::make_unique(pipe); + }), py::arg("pipe")) + .def(py::init([](const ov::genai::InpaintingPipeline& pipe) { + return std::make_unique(pipe); + }), py::arg("pipe")); + + image2image_pipeline + .def(py::init([](const ov::genai::InpaintingPipeline& pipe) { + return std::make_unique(pipe); + }), py::arg("pipe")); + + inpainting_pipeline + .def(py::init([](const ov::genai::Image2ImagePipeline& pipe) { + return std::make_unique(pipe); + }), py::arg("pipe")); } From 095840028c8f412ac7ed6e7c51c2f5b1fbf65853 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 14 Dec 2024 00:48:08 +0400 Subject: [PATCH 06/18] Fixed typo in image generation readme (#1384) Sync with C++ readme https://github.com/openvinotoolkit/openvino.genai/blob/d189eb7541a61a41581dd21361db3aa3884d211b/samples/cpp/image_generation/README.md?plain=1#L121-L125 --- samples/python/image_generation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md index 321f3f6d05..33da6bd43a 100644 --- a/samples/python/image_generation/README.md +++ b/samples/python/image_generation/README.md @@ -112,7 +112,7 @@ To run the sample, download initial image first: And then run the sample: -`python image2mage.py ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting' small_city.bmp` +`python image2mage.py ./dreamlike_anime_1_0_ov/FP16 'cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k' cat.png` The resuling image is: From 8045cf0d6340312aebaeb73cd927f9375079a8ee Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 14 Dec 2024 00:50:55 +0400 Subject: [PATCH 07/18] GHA: use OpenVINO GenAI from PR in LLM bench / WWB tests (#1376) - Install OpenVINO GenAI from source code in LLM bench pipeline - Use OpenVINO provider to ensure OpenVINO GenAI is built on the same platform as OpenVINO - Changed `dreamlike-anime` to `OpenVINO/LCM_Dreamshaper_v7-int8-ov` to save conversion time --- .github/workflows/linux.yml | 2 - .github/workflows/llm_bench-python.yml | 265 +++++++++++++++++-------- 2 files changed, 180 insertions(+), 87 deletions(-) diff --git a/.github/workflows/linux.yml 
b/.github/workflows/linux.yml index 18107aa203..8d596aed56 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -34,7 +34,6 @@ jobs: status: ${{ steps.openvino_download.outcome }} ov_artifact_name: ${{ steps.openvino_download.outputs.ov_artifact_name }} ov_wheel_source: ${{ steps.openvino_download.outputs.ov_wheel_source }} - ov_version: ${{ steps.openvino_download.outputs.ov_version }} docker_tag: ${{ steps.get_docker_tag.outputs.docker_tag }} timeout-minutes: 10 defaults: @@ -147,7 +146,6 @@ jobs: OV_INSTALL_DIR: ${{ github.workspace }}/ov INSTALL_DIR: ${{ github.workspace }}/install WHEELS_DIR: ${{ github.workspace }}/install/wheels - BUILD_DIR: ${{ github.workspace }}/build SRC_DIR: ${{ github.workspace }}/src steps: diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 2c8f6a358a..f0df824efa 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -19,105 +19,200 @@ concurrency: group: ${{ github.event_name == 'push' && github.run_id || github.ref }}-llm-bench-python cancel-in-progress: true -env: - LLM_BENCH_PYPATH: tools/llm_bench - WWB_PATH: tools/who_what_benchmark - jobs: + openvino_download: + name: Download OpenVINO + outputs: + status: ${{ steps.openvino_download.outcome }} + ov_artifact_name: ${{ steps.openvino_download.outputs.ov_artifact_name }} + ov_wheel_source: ${{ steps.openvino_download.outputs.ov_wheel_source }} + ov_version: ${{ steps.openvino_download.outputs.ov_version }} + docker_tag: ${{ steps.get_docker_tag.outputs.docker_tag }} + timeout-minutes: 10 + defaults: + run: + shell: bash + runs-on: aks-linux-2-cores-8gb + container: + image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0' + volumes: + - /mount:/mount + - ${{ github.workspace }}:${{ github.workspace }} + + steps: + - uses: openvinotoolkit/openvino/.github/actions/openvino_provider@master + id: openvino_download + with: + platform: ubuntu22 + commit_packages_to_provide: wheels + revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868 + + - name: Clone docker tag from OpenVINO repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: 'openvinotoolkit/openvino' + path: 'openvino' + ref: ${{ env.OV_BRANCH }} + sparse-checkout: | + .github/dockerfiles/docker_tag + + - name: Save docker tag to output + id: get_docker_tag + run: | + docker_tag=$(cat openvino/.github/dockerfiles/docker_tag) + echo "docker_tag=$docker_tag" >> $GITHUB_OUTPUT + build: - runs-on: ubuntu-latest + defaults: + run: + shell: bash + runs-on: ubuntu-22.04 strategy: fail-fast: false matrix: - python-version: ["3.10"] + python-version: ["3.11"] + needs: [ openvino_download ] + env: + OV_INSTALL_DIR: ${{ github.workspace }}/ov + SRC_DIR: ${{ github.workspace }} + LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench + WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark + steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install flake8 pytest black - GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt - python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - name: Lint with flake8 - run: | - # stop the build if 
there are Python syntax errors or undefined names - python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg - python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg - - name: Create code style diff for samples - if: failure() - run: | - python -m black -l 160 -S ${{ env.LLM_BENCH_PYPATH }}/ - git diff > llm.bench_diff.diff - - uses: actions/upload-artifact@v3 - if: failure() - with: - name: llm.bench_diff - path: llm.bench_diff.diff - - name: Test native pytorch model on Linux - run: | - git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen - python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt - env: - GIT_LFS_SKIP_SMUDGE: 0 - - name: Test tiny-random-baichuan2 on Linux Optimum Intel - run: | - optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16 - python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum - - name: Test tiny-stable-diffusion on Linux Optimum Intel - run: | - optimum-cli export openvino --model segmind/tiny-sd --trust-remote-code --weight-format fp16 ./ov_models/tiny-sd/pytorch/dldt/FP16/ - python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum - - name: Test dreamlike-anime on Linux with GenAI - run: | - optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 ov_models/dreamlike-art-dreamlike-anime-1.0/FP16 - python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 - - name: Test dreamlike-anime on Linux with GenAI and LoRA - run: | - wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591 - python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 - - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux - run: | - optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16 - optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8 - python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4 - python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5 - - name: Test whisper-tiny on Linux - run: | - GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech - cd multilingual_librispeech - git lfs pull -I /data/mls_polish/train/audio/3283_1447_000.tar.gz - mkdir data/mls_polish/train/audio/3283_1447_000 - tar zxvf data/mls_polish/train/audio/3283_1447_000.tar.gz -C data/mls_polish/train/audio/3283_1447_000/ - cd .. 
- optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny ./ov_models/whisper-tiny - python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --optimum - python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 - - name: WWB Tests - run: | - pip install git+https://github.com/huggingface/optimum-intel.git - GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }} - python -m pytest -v ${{ env.WWB_PATH }}/tests + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: recursive + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Download OpenVINO package + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: ${{ needs.openvino_download.outputs.ov_artifact_name }} + path: ${{ env.OV_INSTALL_DIR }} + merge-multiple: true + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest black + python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }} + python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }} + GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }} + working-directory: ${{ env.OV_INSTALL_DIR }} + env: + CMAKE_BUILD_PARALLEL_LEVEL: 4 + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + python -m flake8 ${{ env.LLM_BENCH_PYPATH }} --config=${{ env.LLM_BENCH_PYPATH }}/setup.cfg + python -m flake8 ${{ env.WWB_PATH }} --config=${{ env.WWB_PATH }}/setup.cfg + - name: Create code style diff for samples + if: failure() + run: | + python -m black -l 160 -S ${{ env.LLM_BENCH_PYPATH }}/ + git diff > llm.bench_diff.diff + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + if: failure() + with: + name: llm.bench_diff + path: llm.bench_diff.diff + - name: Test native pytorch model on Linux + run: | + git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen + python ./tools/llm_bench/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt + env: + GIT_LFS_SKIP_SMUDGE: 0 + - name: Test tiny-random-baichuan2 on Linux Optimum Intel + run: | + optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16 + python ./tools/llm_bench/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 --optimum + - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel + run: | + huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7 + python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum + - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI + run: | + python ./tools/llm_bench/benchmark.py -m 
./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 + - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA + run: | + wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591 + python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 + - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux + run: | + optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16 + optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8 + python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --assistant_confidence_threshold 0.4 + python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --num_assistant_tokens 5 + - name: Test whisper-tiny on Linux + run: | + GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech + cd multilingual_librispeech + git lfs pull -I /data/mls_polish/train/audio/3283_1447_000.tar.gz + mkdir data/mls_polish/train/audio/3283_1447_000 + tar zxvf data/mls_polish/train/audio/3283_1447_000.tar.gz -C data/mls_polish/train/audio/3283_1447_000/ + cd .. 
+ optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny ./ov_models/whisper-tiny + python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --optimum + python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 + - name: WWB Tests + run: | + pip install git+https://github.com/huggingface/optimum-intel.git + GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }} + python -m pytest -v ${{ env.WWB_PATH }}/tests + stateful: - runs-on: ubuntu-20.04 + defaults: + run: + shell: bash + runs-on: ubuntu-22.04 + needs: [ openvino_download ] + env: + OV_INSTALL_DIR: ${{ github.workspace }}/ov + SRC_DIR: ${{ github.workspace }} + LLM_BENCH_PYPATH: ${{ github.workspace }}/tools/llm_bench + WWB_PATH: ${{ github.workspace }}/tools/who_what_benchmark + steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: recursive + - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: "3.11" + - name: Download OpenVINO package + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: - python-version: "3.10" + name: ${{ needs.openvino_download.outputs.ov_artifact_name }} + path: ${{ env.OV_INSTALL_DIR }} + merge-multiple: true - name: Test stateful run: | - GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r tools/llm_bench/requirements.txt - python -m pip uninstall --yes openvino - python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python tools/llm_bench/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir . 
--stateful - grep beam_idx pytorch/dldt/FP32/openvino_model.xml + python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers -v ${{ needs.openvino_download.outputs.ov_wheel_source }} + python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }} + GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }} + python ${{ env.LLM_BENCH_PYPATH }}/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ${{ env.SRC_DIR }} --stateful + grep beam_idx ${{ env.SRC_DIR }}/pytorch/dldt/FP32/openvino_model.xml + working-directory: ${{ env.OV_INSTALL_DIR }} + env: + CMAKE_BUILD_PARALLEL_LEVEL: 4 - name: WWB Tests run: | pip install pytest pip install git+https://github.com/huggingface/optimum-intel.git GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }} python -m pytest -v ${{ env.WWB_PATH }}/tests + + Overall_Status: + name: ci/gha_overall_status_llm_bench + needs: [openvino_download, build, stateful] + if: ${{ always() }} + runs-on: ubuntu-latest + steps: + - name: Check status of all jobs + if: >- + ${{ + contains(needs.*.result, 'failure') || + contains(needs.*.result, 'cancelled') + }} + run: exit 1 From 669588df2534a3ba96f9589f3645269b4d1f88c9 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Sat, 14 Dec 2024 00:51:28 +0400 Subject: [PATCH 08/18] LLM Inference Guide -> Generative AI workflow (#1383) --- README.md | 10 +++++----- tools/llm_bench/README.md | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c00971a4e3..4892c86f10 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Continuous batching functionality is used within OpenVINO Model Server (OVMS) to ## Performing text generation
-For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) +For more examples check out our [Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) ### Converting and compressing text generation model from Hugging Face library @@ -103,7 +103,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Create+a ## Performing visual language text generation
-For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) +For more examples check out our [Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) ### Converting and compressing the model from Hugging Face library @@ -173,7 +173,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Visual-l
-For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) +For more examples check out our [Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) ### Converting and compressing image generation model from Hugging Face library @@ -335,7 +335,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Text+to+ ## Speech-to-text processing using Whisper Pipeline
-For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) +For more examples check out our [Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) NOTE: Whisper Pipeline requires preprocessing of audio input (to adjust sampling rate and normalize) @@ -397,7 +397,7 @@ See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Automati ## Additional materials - [List of supported models](https://github.com/openvinotoolkit/openvino.genai/blob/master/src/docs/SUPPORTED_MODELS.md) (NOTE: models can work, but were not tried yet) -- [OpenVINO LLM inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) +- [OpenVINO Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) - [Optimum-intel and OpenVINO](https://huggingface.co/docs/optimum/intel/openvino/export) ## License diff --git a/tools/llm_bench/README.md b/tools/llm_bench/README.md index bcb7436189..d0ce53145d 100755 --- a/tools/llm_bench/README.md +++ b/tools/llm_bench/README.md @@ -32,7 +32,7 @@ huggingface-cli login The `optimum-cli` tool simplifies converting Hugging Face models to OpenVINO IR format. - Detailed documentation can be found in the [Optimum-Intel documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/export). - To learn more about weight compression, see the [NNCF Weight Compression Guide](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html). -- For additional guidance on running inference with OpenVINO for LLMs, see the [OpenVINO LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html). +- For additional guidance on running inference with OpenVINO for LLMs, see the [OpenVINO Generative AI workflow](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html). 
**Usage:** From c77f7c93f5f82cde4988a8ef1b3ca204d3d6873d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 14 Dec 2024 02:38:32 +0400 Subject: [PATCH 09/18] GHA: use OpenVINO provider for SD (#1386) --- .github/workflows/llm_bench-python.yml | 20 ---- .../workflows/stable_diffusion_1_5_cpp.yml | 103 ++++++++++++++---- 2 files changed, 80 insertions(+), 43 deletions(-) diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index f0df824efa..8b022f27e0 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -27,7 +27,6 @@ jobs: ov_artifact_name: ${{ steps.openvino_download.outputs.ov_artifact_name }} ov_wheel_source: ${{ steps.openvino_download.outputs.ov_wheel_source }} ov_version: ${{ steps.openvino_download.outputs.ov_version }} - docker_tag: ${{ steps.get_docker_tag.outputs.docker_tag }} timeout-minutes: 10 defaults: run: @@ -47,21 +46,6 @@ jobs: commit_packages_to_provide: wheels revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868 - - name: Clone docker tag from OpenVINO repo - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - repository: 'openvinotoolkit/openvino' - path: 'openvino' - ref: ${{ env.OV_BRANCH }} - sparse-checkout: | - .github/dockerfiles/docker_tag - - - name: Save docker tag to output - id: get_docker_tag - run: | - docker_tag=$(cat openvino/.github/dockerfiles/docker_tag) - echo "docker_tag=$docker_tag" >> $GITHUB_OUTPUT - build: defaults: run: @@ -100,8 +84,6 @@ jobs: python -m pip install ${{ env.SRC_DIR }} -v ${{ needs.openvino_download.outputs.ov_wheel_source }} GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt ${{ needs.openvino_download.outputs.ov_wheel_source }} working-directory: ${{ env.OV_INSTALL_DIR }} - env: - CMAKE_BUILD_PARALLEL_LEVEL: 4 - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -194,8 +176,6 @@ jobs: python ${{ env.LLM_BENCH_PYPATH }}/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ${{ env.SRC_DIR }} --stateful grep beam_idx ${{ env.SRC_DIR }}/pytorch/dldt/FP32/openvino_model.xml working-directory: ${{ env.OV_INSTALL_DIR }} - env: - CMAKE_BUILD_PARALLEL_LEVEL: 4 - name: WWB Tests run: | pip install pytest diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index b355cd4f09..497bfbff3e 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -17,29 +17,83 @@ concurrency: cancel-in-progress: true env: - PYTHON_VERSION: '3.10' - LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241205_x86_64.tgz - WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/w_openvino_toolkit_windows_2025.0.0.dev20241205_x86_64.zip + PYTHON_VERSION: '3.11' OV_INSTALL_DIR: ${{ github.workspace }}/ov jobs: + openvino_download_linux: + name: Download OpenVINO for Linux + outputs: + status: ${{ steps.openvino_download.outcome }} + ov_artifact_name: ${{ steps.openvino_download.outputs.ov_artifact_name }} + ov_wheel_source: ${{ steps.openvino_download.outputs.ov_wheel_source }} + ov_version: ${{ steps.openvino_download.outputs.ov_version }} + timeout-minutes: 10 + defaults: + run: + shell: bash + runs-on: aks-linux-2-cores-8gb + container: + 
image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0' + volumes: + - /mount:/mount + - ${{ github.workspace }}:${{ github.workspace }} + + steps: + - uses: openvinotoolkit/openvino/.github/actions/openvino_provider@master + id: openvino_download + with: + platform: ubuntu22 + commit_packages_to_provide: wheels + revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868 + + openvino_download_windows: + name: Download OpenVINO for Windows + outputs: + status: ${{ steps.openvino_download.outcome }} + ov_artifact_name: ${{ steps.openvino_download.outputs.ov_artifact_name }} + ov_wheel_source: ${{ steps.openvino_download.outputs.ov_wheel_source }} + ov_version: ${{ steps.openvino_download.outputs.ov_version }} + timeout-minutes: 10 + defaults: + run: + shell: bash + runs-on: aks-linux-2-cores-8gb + container: + image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0' + volumes: + - /mount:/mount + - ${{ github.workspace }}:${{ github.workspace }} + + steps: + - uses: openvinotoolkit/openvino/.github/actions/openvino_provider@master + id: openvino_download + with: + platform: windows + commit_packages_to_provide: wheels + revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868 + stable_diffusion_1_5_cpp-linux: - runs-on: ubuntu-20.04-8-cores + runs-on: ubuntu-22.04-8-cores + needs: [ openvino_download_linux ] defaults: run: shell: bash -l {0} env: build_dir: ${{ github.workspace }}//build + SRC_DIR: ${{ github.workspace }} + steps: - uses: actions/checkout@v4 with: submodules: recursive - - name: Download OpenVINO archive - run: | - wget ${{ env.LINUX_OV_ARCHIVE_URL}} --progress=bar:force:noscroll -O openvino_package.tar.gz - mkdir ${{ env.OV_INSTALL_DIR }} - tar -xzf openvino_package.tar.gz -C ${{ env.OV_INSTALL_DIR }} --strip-components=1 + - name: Download OpenVINO package + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: ${{ needs.openvino_download_linux.outputs.ov_artifact_name }} + path: ${{ env.OV_INSTALL_DIR }} + merge-multiple: true - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 @@ -58,9 +112,10 @@ jobs: - name: Install python dependencies run: | - source openvino_sd_cpp/bin/activate - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ./samples/requirements.txt + source ${{ github.workspace }}/openvino_sd_cpp/bin/activate + python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers/[transformers] ${{ needs.openvino_download_linux.outputs.ov_wheel_source }} + python -m pip install -r ${{ env.SRC_DIR }}/samples/requirements.txt + working-directory: ${{ env.OV_INSTALL_DIR }} - name: Download and convert models and tokenizer run: | @@ -95,25 +150,26 @@ jobs: PYTHONPATH: ${{ env.build_dir }} stable_diffusion_1_5_cpp-windows: + needs: [ openvino_download_windows ] runs-on: windows-2019 defaults: run: shell: pwsh env: build_dir: ${{ github.workspace }}\build + SRC_DIR: ${{ github.workspace }} + steps: - uses: actions/checkout@v4 with: submodules: recursive - - name: Download OpenVINO archive - run: | - mkdir ${{ env.OV_INSTALL_DIR }} - pushd ${{ env.OV_INSTALL_DIR }} - Invoke-WebRequest "${{ env.WINDOWS_OV_ARCHIVE_URL}}" -OutFile "openvino_package.zip" - Expand-Archive openvino_package.zip -DestinationPath ./tmp - mv ./tmp/*/* . 
- popd + - name: Download OpenVINO package + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: ${{ needs.openvino_download_windows.outputs.ov_artifact_name }} + path: ${{ env.OV_INSTALL_DIR }} + merge-multiple: true - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 @@ -132,9 +188,10 @@ jobs: - name: Install python dependencies run: | - . "./openvino_sd_cpp/Scripts/Activate.ps1" - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ./samples/requirements.txt + . "${{ github.workspace }}/openvino_sd_cpp/Scripts/Activate.ps1" + python -m pip install ${{ env.SRC_DIR }}/thirdparty/openvino_tokenizers/[transformers] ${{ needs.openvino_download_windows.outputs.ov_wheel_source }} + python -m pip install -r ${{ env.SRC_DIR }}/samples/requirements.txt + working-directory: ${{ env.OV_INSTALL_DIR }} - name: Download and convert models and tokenizer run: | From 4a7374bc1533466a159477760bf1cee1c1b10443 Mon Sep 17 00:00:00 2001 From: Dmitriy Pastushenkov Date: Mon, 16 Dec 2024 11:16:19 +0100 Subject: [PATCH 10/18] fix some typos in image2image sample readme (#1388) fix some typos in image2image sample readme --- samples/python/image_generation/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md index 33da6bd43a..0ddf57d882 100644 --- a/samples/python/image_generation/README.md +++ b/samples/python/image_generation/README.md @@ -108,11 +108,11 @@ Also, `strength` parameter linearly affects a number of inferenece steps, becaus To run the sample, download initial image first: -`wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png` +`wget -O cat.png https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png` And then run the sample: -`python image2mage.py ./dreamlike_anime_1_0_ov/FP16 'cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k' cat.png` +`python image2image.py ./dreamlike_anime_1_0_ov/FP16 'cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k' cat.png` The resuling image is: From 8ce5eb389179ba82da6523f849944ea3dc8c93e0 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 16 Dec 2024 15:49:38 +0400 Subject: [PATCH 11/18] Update streaming in LM Encoding & CB (#1377) --- src/cpp/src/continuous_batching_impl.cpp | 8 +++--- src/cpp/src/lm_encoding.cpp | 25 +++++++++++-------- .../speculative_decoding_impl.cpp | 2 -- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index d27e8934dc..1e42f5b2d9 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -285,9 +285,11 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorcan_read()) { std::unordered_map token = generations.at(0).get()->back(); - OPENVINO_ASSERT(1 == token.size()); - OPENVINO_ASSERT(1 == token.begin()->second.generated_ids.size()); - continue_generation = !streamer_ptr->put(token.begin()->second.generated_ids.at(0)); + for (const auto& gen_token : token.begin()->second.generated_ids) { + if (!streamer_ptr->put(gen_token)) { + break; + } + } } } diff --git a/src/cpp/src/lm_encoding.cpp 
b/src/cpp/src/lm_encoding.cpp index c76d9f7edf..3ab041fa58 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -125,6 +125,17 @@ std::pair get_lm_encoded_results( active_sequence_groups.end(), get_active_sequence_groups), active_sequence_groups.end()); + + auto stream_generated_tokens = [&streamer_ptr, &generations]() { + if (streamer_ptr && generations.at(0).get()->can_read()) { + std::unordered_map token = generations.at(0).get()->back(); + for (const auto& gen_token : token.begin()->second.generated_ids) { + if (!streamer_ptr->put(gen_token)) { + break; + } + } + } + }; while (active_sequence_groups.size() > 0) { size_t total_num_tokens = 0; @@ -202,13 +213,7 @@ std::pair get_lm_encoded_results( raw_perf_counters.m_new_token_times.emplace_back(infer_end); raw_perf_counters.m_batch_sizes.emplace_back(batch_size); - if (streamer_ptr) { - // stream data from first sequence - int64_t out_token = sequence_groups.at(0).get()->operator[](0)->get_generated_ids().back(); - if (streamer_ptr->put(out_token)) { - break; - } - } + stream_generated_tokens(); sampler_output = sampler.sample(active_sequence_groups, m_llm.get_tensor("logits")); @@ -218,9 +223,9 @@ std::pair get_lm_encoded_results( active_sequence_groups.end()); } + // to stream last token + stream_generated_tokens(); if (streamer_ptr) { - int64_t out_token = sequence_groups.at(0).get()->operator[](0)->get_generated_ids().back(); - streamer_ptr->put(out_token); streamer_ptr->end(); } @@ -246,4 +251,4 @@ std::pair get_lm_encoded_results( } } // namespace genai -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index 2be67320a9..e4f3b1ad1f 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -232,8 +232,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector< continue; } std::unordered_map token = main_generations.at(0).get()->back(); - OPENVINO_ASSERT(1 <= token.size()); - OPENVINO_ASSERT(1 <= token.begin()->second.generated_ids.size()); for (const auto& gen_token : token.begin()->second.generated_ids) { continue_generation = !streamer_ptr->put(gen_token); if (!continue_generation) { From 9e9b409a871ac75fe9d515d6ae6d68d882350b35 Mon Sep 17 00:00:00 2001 From: Sofya Balandina Date: Mon, 16 Dec 2024 13:02:45 +0000 Subject: [PATCH 12/18] Use whole history in case of undetermined tokenization of sequence (#1254) Task: [CVS-157295](https://jira.devtools.intel.com/browse/CVS-157295) - fist commit is cherry-pick from https://github.com/openvinotoolkit/openvino.genai/pull/1268 and https://github.com/openvinotoolkit/openvino.genai/pull/1361 - next commit includes applying comments from https://github.com/openvinotoolkit/openvino.genai/pull/1268 and adding usage of kv cache for LLM --- src/cpp/src/llm_pipeline.cpp | 108 +++++++++++++++--- src/cpp/src/utils.cpp | 75 ++++++++++++ src/cpp/src/utils.hpp | 11 ++ .../src/visual_language/inputs_embedder.cpp | 73 +++++++++--- .../src/visual_language/inputs_embedder.hpp | 7 ++ src/cpp/src/visual_language/pipeline.cpp | 27 ++++- 6 files changed, 264 insertions(+), 37 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 84f76730eb..f663b27dd9 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -36,13 +36,15 @@ std::pair beam_search( class 
StatefulLLMPipeline final : public LLMPipelineImplBase { public: ov::InferRequest m_model_runner; - bool is_chat_conversation = false; - bool m_is_cache_empty = true; + bool m_trust_encoded_history = true; std::optional m_selected_beam = std::nullopt; ChatHistory m_history; std::string m_templated_chat_history = {}; - TokenizedInputs m_tokenized_chat_history; + std::vector m_tokenized_chat_history; + ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF; + size_t m_to_remove_from_hist = 0; + size_t m_kv_cache_seq_length_axis = 2; StatefulLLMPipeline( const ov::InferRequest& request, @@ -77,6 +79,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { ov::Core core; auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config); utils::slice_matmul_statefull_model(model); + m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model); if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model."); @@ -102,8 +105,20 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { + if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF) + m_chat_input_type = ov::genai::utils::GenerationChatInputsType::STRING; + + if (is_chat_conversation) + OPENVINO_ASSERT(m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS, + "Chat doesn't support switching between input types. Please, continue using EncodedInputs or restart the chat."); + auto start_time = std::chrono::steady_clock::now(); GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + // If eos_token_id was not provided, take value from default m_generation_config + if (config.eos_token_id == -1) + config.set_eos_token_id(m_generation_config.eos_token_id); + config.validate(); + TokenizedInputs encoded_input; if (auto input_vector = std::get_if>(&inputs)) { @@ -127,19 +142,51 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); // Do not add special tokens in chat scenario to be aligned with HF. 
auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)); - if (m_is_cache_empty) { + auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); + + // some symbols combinations can be encoded by the tokenizer in different ways + // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history + // so let's check it out, find the trusted part and use it in on the next step + size_t last_same_hist_token = 0; + if (!m_tokenized_chat_history.empty()) { + std::set stop_tokens = config.stop_token_ids; + last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens); + m_trust_encoded_history = last_same_hist_token == SIZE_MAX; + } + + if (m_tokenized_chat_history.empty()) { encoded_input = new_chat_tokens; + } else if (last_same_hist_token != SIZE_MAX) { + m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token; + + ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(), + {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token}, + new_chat_tokens.input_ids.data() + last_same_hist_token); + + ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape()); + std::fill_n(new_attention_mask.data(), new_tensor.get_shape()[1], 1); + + encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(), + {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token}); + new_tensor.copy_to(encoded_input.input_ids); + encoded_input.attention_mask = new_attention_mask; + + m_selected_beam = std::nullopt; } else { - auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens); } m_templated_chat_history = new_templated_chat_history; - m_tokenized_chat_history = new_chat_tokens; + m_tokenized_chat_history.clear(); + m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size()); + std::copy_n(new_chat_tokens.input_ids.data(), new_chat_tokens.input_ids.get_size(), + std::back_inserter(m_tokenized_chat_history)); + // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied } else { encoded_input = m_tokenizer.encode(prompt); } } + auto encode_stop_time = std::chrono::steady_clock::now(); auto encoded_results = generate(encoded_input, config, streamer); @@ -188,6 +235,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { + if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF) + m_chat_input_type = ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS; + + if (is_chat_conversation) + // if chat was run in StringInputs mode, but it was called EncodedInputs generate, last m_history entry will be with assistant role + OPENVINO_ASSERT(m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user", + "Chat doesn't support switching between input types. 
Please, continue using StringInputs or restart the chat."); + auto start_time = std::chrono::steady_clock::now(); ov::Tensor input_ids; ov::Tensor attention_mask; @@ -199,6 +254,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { attention_mask = data->attention_mask; } + if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) + std::copy(input_ids.data(), input_ids.data() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history)); + GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; // If eos_token_id was not provided, take value from default m_generation_config @@ -230,16 +288,17 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { "(input_ids, attention_mask, position_ids, beam_idx) " "but you have '" + std::to_string(num_inputs) + "' inputs"); + ov::genai::utils::trim_kv_cache(m_model_runner, m_to_remove_from_hist, m_kv_cache_seq_length_axis, m_adapter_controller); size_t kv_cache_len = 0; ov::Tensor concatenated_attention_mask; - if (is_chat_conversation && !m_is_cache_empty) { + if (is_chat_conversation && !m_tokenized_chat_history.empty()) { OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1"); // If history is saved in KV cache, concatenate new attention_mask with the already existing. // Between subsequent runs attention_mask should not be modified. auto atten_mask_history = m_model_runner.get_tensor("attention_mask"); auto prompt_len = attention_mask.get_shape()[1]; - kv_cache_len = atten_mask_history.get_shape()[1]; + kv_cache_len = atten_mask_history.get_shape()[1] - m_to_remove_from_hist; ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}}; auto start_atten_hst = atten_mask_history.data() + kv_cache_len * (*m_selected_beam); @@ -263,6 +322,11 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { m_adapter_controller->apply(m_model_runner, config.adapters); } + if (is_chat_conversation && !m_trust_encoded_history) { + m_trust_encoded_history = true; + m_to_remove_from_hist = 0; + } + ov::genai::EncodedResults result; if (config.is_beam_search() && is_chat_conversation) { std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_ids, concatenated_attention_mask, @@ -274,8 +338,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { for (size_t request_id = 0; request_id < batch_size; request_id++) { SequenceGroup::Ptr sequence_group; - if (is_chat_conversation && !m_is_cache_empty) { - sequence_group = std::make_shared(request_id, m_tokenized_chat_history.input_ids, config, block_size, enable_prefix_caching); + if (is_chat_conversation) { + ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data()); + sequence_group = std::make_shared(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching); } else { size_t seq_len = input_ids.get_shape().at(1); size_t batch_offset = request_id * seq_len; @@ -294,12 +359,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { sampler, requests, position_ids, std::nullopt, m_selected_beam); } - if (!is_chat_conversation) { + if (is_chat_conversation) { + std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history)); + } else { reset_kv_state(); m_selected_beam = std::nullopt; - } else { - m_is_cache_empty = false; } + auto stop_time 
= std::chrono::steady_clock::now(); // If is called without tokenization then that stat will not be reported. @@ -313,12 +379,15 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { void start_chat(const std::string& system_message) override { is_chat_conversation = true; - m_selected_beam = std::nullopt; - if (!m_is_cache_empty) { + m_selected_beam = std::nullopt; + m_trust_encoded_history = true; + m_to_remove_from_hist = 0; + m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF; + if (!m_tokenized_chat_history.empty()) { reset_kv_state(); - m_is_cache_empty = true; m_history = {}; m_templated_chat_history = ""; + m_tokenized_chat_history.clear(); } if (system_message.empty()) return; @@ -332,11 +401,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { void finish_chat() override { is_chat_conversation = false; m_selected_beam = std::nullopt; - if (!m_is_cache_empty) { + m_trust_encoded_history = true; + m_to_remove_from_hist = 0; + m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF; + if (!m_tokenized_chat_history.empty()) { reset_kv_state(); - m_is_cache_empty = true; m_history.clear(); m_templated_chat_history.clear(); + m_tokenized_chat_history.clear(); } } }; diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 337b0ab47e..3690920295 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -13,6 +13,8 @@ #include "openvino/op/tanh.hpp" #include "openvino/op/transpose.hpp" +#include "sampler.hpp" + namespace ov { namespace genai { namespace utils { @@ -306,6 +308,79 @@ ov::Core singleton_core() { return core; } +size_t get_first_history_difference(const ov::Tensor& encoded_history, const std::vector tokenized_history, std::set stop_tokens) { + size_t idx = 0; + auto encoded_history_data = encoded_history.data(); + while(idx < encoded_history.get_size() && idx < tokenized_history.size()) { + if (encoded_history_data[idx] != tokenized_history[idx]) + break; + idx++; + } + + // encoded_history after decode of tokenizer could lose one last token (eos/stop token) + if ((idx == tokenized_history.size() && idx == encoded_history.get_size()) || + (encoded_history.get_size() < tokenized_history.size() && idx == tokenized_history.size() - 1 && stop_tokens.find(tokenized_history.back()) != stop_tokens.end())) + return SIZE_MAX; + else + return idx; +} + +size_t get_seq_len_axis(std::shared_ptr model) { + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], + // therefore usually seq_length_axis = 2 + size_t seq_length_axis = 2; + + // "ReadValue" node is KV cache representation in stateful model + std::string kv_node_type_name = std::string(ov::op::v6::ReadValue::get_type_info_static().name); + + for (const auto op : model->get_ops()) { + // check input size, as in LoRA adapters case it could be 0 + if (op->get_type_name() != kv_node_type_name || op->get_input_size() < 1) { + continue; + } + + // Shape example: [-1,4,0,64] + auto shape = op->get_input_partial_shape(0); + + for (size_t i = 0; i < shape.rank().get_length(); i++) { + // Find axis = 0. This would be sequence length axis. 
+ if (shape[i] == 0) { + seq_length_axis = i; + } + } + break; + } + + return seq_length_axis; +} + +void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional adapter_controller) { + // nothing to trim in this case + if (remove_from_end == 0) + return; + + auto states = request.query_state(); + for (auto& state : states) { + if(adapter_controller && adapter_controller->has_state_name(state.get_name())) + continue; + + ov::Tensor old_tensor = state.get_state(); + // [BATCH_SIZE, num_kv_heads, seq_len, head_size] + auto shape = old_tensor.get_shape(); + shape[seq_length_axis] -= remove_from_end; + + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{shape}; + + auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); + + ov::Tensor new_tensor(old_tensor.get_element_type(), shape); + trimmed_tensor.copy_to(new_tensor); + + state.set_state(new_tensor); + } +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 792987d383..57728cd0dc 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -22,6 +22,11 @@ constexpr bool is_container().begin()), decltype(std::declval().end())>> = true; +enum class GenerationChatInputsType { + UNDEF = 0, // Default value, type of inputs is not defined + STRING = 1, // Type of inputs is StringInputs + ENCODED_INPUTS = 2, // Type of inputs is EncodedInputs +}; Tensor init_attention_mask(const Tensor& position_ids); @@ -93,6 +98,12 @@ ov::Core singleton_core(); template void read_rt_info(std::shared_ptr& model, const char* name, T& value); +size_t get_first_history_difference(const ov::Tensor& encoded_history, const std::vector tokenized_history, std::set stop_tokens); + +size_t get_seq_len_axis(std::shared_ptr model); + +void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional adapter_controller); + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index ced17a2ebd..dfdb1521ef 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -39,8 +39,11 @@ class InputsEmbedder::IInputsEmbedder { ChatHistory m_history; // Templated chat history std::string m_templated_chat_history; - // Whether we have computed some inputs already - bool m_is_cache_empty = true; + // Tokenized chat history + std::vector m_tokenized_chat_history; + // The number of elements, which need to remove from the end of KV cache + // removed elements will be added to inputs_ids + size_t m_to_remove_from_hist = 0; public: virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) = 0; @@ -53,12 +56,26 @@ class InputsEmbedder::IInputsEmbedder { return m_tokenizer; } + std::vector get_tokenized_chat_history() const { + return m_tokenized_chat_history; + } + + size_t get_amount_to_remove_from_hist() const { + return m_to_remove_from_hist; + } + + void update_tokenized_chat_history(std::vector encoded_result) { + std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_chat_history)); + m_to_remove_from_hist = 0; + } + virtual void start_chat(const std::string& system_message) { m_is_chat_conversation = true; - if (!m_is_cache_empty) { + m_to_remove_from_hist = 0; + if 
(!m_tokenized_chat_history.empty()) { m_history.clear(); m_templated_chat_history.clear(); - m_is_cache_empty = true; + m_tokenized_chat_history.clear(); } if (system_message.empty()) { return; @@ -77,10 +94,11 @@ class InputsEmbedder::IInputsEmbedder { virtual void finish_chat() { m_is_chat_conversation = false; - m_is_cache_empty = true; + m_to_remove_from_hist = 0; m_history.clear(); m_templated_chat_history.clear(); + m_tokenized_chat_history.clear(); } protected: @@ -92,7 +110,7 @@ class InputsEmbedder::IInputsEmbedder { m_vlm_config{vlm_config}, m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), - m_tokenizer{model_dir.string(), device_config} { } + m_tokenizer{model_dir, device_config} { } IInputsEmbedder( const VLMConfig& vlm_config, @@ -140,15 +158,28 @@ class InputsEmbedder::IInputsEmbedder { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; - if (m_is_cache_empty) { + ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; + TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); + + // some symbols combinations can be encoded by the tokenizer in different ways + // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history + // so let's check it out, find the trusted part and use it in on the next step + size_t last_same_hist_token = 0; + if (!m_tokenized_chat_history.empty()) { + std::set stop_tokens = {m_tokenizer.get_eos_token_id()}; + last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens); + } + + if (m_tokenized_chat_history.empty()) { encoded_input_ids = new_chat_tokens; - // after first `get_inputs_embeds` is called, we supposed LLM is inferred and cache is not empty - m_is_cache_empty = false; + } else if (last_same_hist_token != SIZE_MAX) { + m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token; + + ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(), + {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token}, + new_chat_tokens.data() + last_same_hist_token); + encoded_input_ids = new_tensor; } else { - TokenizedInputs prev_chat_tokens = m_tokenizer.encode( - m_templated_chat_history - ); encoded_input_ids = utils::subtract_chat_tokenized_inputs( {new_chat_tokens}, prev_chat_tokens ).input_ids; @@ -156,6 +187,9 @@ class InputsEmbedder::IInputsEmbedder { auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); + m_tokenized_chat_history.clear(); + std::copy(new_chat_tokens.data(), new_chat_tokens.data() + new_chat_tokens.get_size(), + std::back_inserter(m_tokenized_chat_history)); } else { auto start_tokenizer_time = std::chrono::steady_clock::now(); encoded_input_ids = m_tokenizer.encode(prompt).input_ids; @@ -639,7 +673,6 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { merged_idx++; } } - return merged_embeds; } }; @@ 
-1138,6 +1171,18 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const { return m_impl->get_embedding_model(); } +std::vector InputsEmbedder::get_tokenized_chat_history() const { + return m_impl->get_tokenized_chat_history(); +} + +void InputsEmbedder::update_tokenized_chat_history(std::vector encoded_result) { + return m_impl->update_tokenized_chat_history(encoded_result); +} + +size_t InputsEmbedder::get_amount_to_remove_from_hist() const { + return m_impl->get_amount_to_remove_from_hist(); +} + Tokenizer InputsEmbedder::get_tokenizer() const { return m_impl->get_tokenizer(); } diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 0e3a3533e2..5c5b9d2b81 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -40,6 +40,13 @@ class InputsEmbedder { // returns tokenizer Tokenizer get_tokenizer() const; + // returns tokenized chat history + std::vector get_tokenized_chat_history() const; + // add new results to tokenized chat history + void update_tokenized_chat_history(std::vector encoded_result); + // returns amount of elements, which need to remove from the end of the KV cache + size_t get_amount_to_remove_from_hist() const; + // starts chat and adds optional system_message to chat history void start_chat(const std::string& system_message); // adds currently generated text to chat history diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index f7508acb35..b8e89a8e04 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -64,6 +64,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { std::shared_ptr m_inputs_embedder; // Load pipeline time float m_load_time_ms = 0; + // Axis num in kv cache from m_language model, which contains information about history len + size_t m_kv_cache_seq_length_axis = 2; VLMPipelineImpl( const std::filesystem::path& models_dir, @@ -87,9 +89,14 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { m_tokenizer = m_inputs_embedder->get_tokenizer(); m_embedding = m_inputs_embedder->get_embedding_model(); - m_language = utils::singleton_core().compile_model( + auto compiled_language_model = utils::singleton_core().compile_model( models_dir / "openvino_language_model.xml", device, properties - ).create_infer_request(); + ); + + auto language_model = compiled_language_model.get_runtime_model(); + m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(language_model); + + m_language = compiled_language_model.create_infer_request(); m_language.get_tensor("attention_mask").set_shape({1, 0}); @@ -153,14 +160,20 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics); auto end_get_inputs_embeds = std::chrono::steady_clock::now(); + auto to_remove_from_hist = m_inputs_embedder->get_amount_to_remove_from_hist(); + ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, m_kv_cache_seq_length_axis, std::nullopt); + Sampler sampler = Sampler(m_tokenizer); std::vector requests; size_t request_id = 0; size_t block_size = 1; // not used bool enable_prefix_caching = false; - size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1); + + auto tokenized_chat_history = m_inputs_embedder->get_tokenized_chat_history(); + size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - to_remove_from_hist; size_t 
inputs_embeds_size = inputs_embeds.get_shape().at(1); + ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size }); std::fill_n(prompt_ids.data(), prompt_ids.get_size(), 0); @@ -185,10 +198,10 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { OPENVINO_ASSERT((generation_config.is_greedy_decoding() || generation_config.is_multinomial() || !streamer_ptr), "Currently streaming is possible only for greedy or multinomial decoding"); - ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, { 1, history_size + inputs_embeds.get_shape()[1] }}; + ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, { 1, history_size + inputs_embeds_size }}; std::fill_n(new_atten_mask.data(), new_atten_mask.get_size(), 1); - ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds.get_shape()[1] }}; + ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }}; std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), history_size); ov::genai::EncodedResults encoded_result; @@ -211,6 +224,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { m_language.reset_state(); m_language.get_tensor("attention_mask").set_shape({1, 0}); } + auto generate_end_time = std::chrono::steady_clock::now(); decoded.perf_metrics = encoded_result.perf_metrics; @@ -228,6 +242,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { // Evaluate statistics decoded.perf_metrics.m_evaluated = false; decoded.perf_metrics.evaluate_statistics(generate_start_time); + + m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0]); + return decoded; } From 7548c4c49c0a91da12c11faf71658bab8a27a3f9 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 16 Dec 2024 20:11:57 +0400 Subject: [PATCH 13/18] Image generation: added TorchGenerator and rng_seed (#1379) - Added `TorchGenerator` which wraps `torch.Generator`. It throws an exception is `torch` is not available. 
- Added `rng_seed` parameter to `ImageGenerationConfig` which has lower priority compared with `generator` when they both are specified to `generate()` or `ImageGenerationConfig::update_generation_config` --- README.md | 12 +- samples/cpp/image_generation/README.md | 11 +- .../cpp/image_generation/lora_text2image.cpp | 8 +- samples/python/image_generation/README.md | 8 +- samples/python/image_generation/baseline.bmp | 3 - .../heterogeneous_stable_diffusion.py | 3 +- samples/python/image_generation/lora.bmp | 3 - .../image_generation/lora_text2image.py | 24 +--- .../python/image_generation/text2image.bmp | 3 - samples/python/image_generation/text2image.py | 15 +- .../openvino/genai/generation_config.hpp | 8 +- .../image_generation/generation_config.hpp | 32 ++++- src/cpp/src/generation_config.cpp | 7 +- .../src/image_generation/flux_pipeline.hpp | 1 - .../image_generation/generation_config.cpp | 28 +++- .../stable_diffusion_3_pipeline.hpp | 6 - .../stable_diffusion_pipeline.hpp | 6 - src/python/openvino_genai/__init__.py | 2 +- src/python/openvino_genai/__init__.pyi | 3 +- .../openvino_genai/py_openvino_genai.pyi | 26 +++- src/python/py_image_generation_pipelines.cpp | 134 +++++++++++++++--- .../whowhatbench/text2image_evaluator.py | 13 +- 22 files changed, 223 insertions(+), 133 deletions(-) delete mode 100644 samples/python/image_generation/baseline.bmp delete mode 100644 samples/python/image_generation/lora.bmp delete mode 100644 samples/python/image_generation/text2image.bmp diff --git a/README.md b/README.md index 4892c86f10..680bc9bc65 100644 --- a/README.md +++ b/README.md @@ -194,12 +194,7 @@ import openvino_genai device = 'CPU' # GPU can be used as well pipe = openvino_genai.Text2ImagePipeline("./dreamlike_anime_1_0_ov/INT8", device) -image_tensor = pipe.generate( - "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting", - width=512, - height=512, - num_inference_steps=20 -) +image_tensor = pipe.generate("cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting") image = Image.fromarray(image_tensor.data[0]) image.save("image.bmp") @@ -218,10 +213,7 @@ int main(int argc, char* argv[]) { const std::string device = "CPU"; // GPU can be used as well ov::genai::Text2ImagePipeline pipe(models_path, device); - ov::Tensor image = pipe.generate(prompt, - ov::genai::width(512), - ov::genai::height(512), - ov::genai::num_inference_steps(20)); + ov::Tensor image = pipe.generate(prompt); imwrite("image.bmp", image, true); } diff --git a/samples/cpp/image_generation/README.md b/samples/cpp/image_generation/README.md index 8a5cc5aa19..f8dc21cc39 100644 --- a/samples/cpp/image_generation/README.md +++ b/samples/cpp/image_generation/README.md @@ -20,6 +20,10 @@ Users can change the sample code and play with the following generation paramete - Apply multiple different LoRA adapters and mix them with different blending coefficients - (Image to image and inpainting) Play with `strength` parameter to control how initial image is noised and reduce number of inference steps + +> [!NOTE] +> Image generated with HuggingFace / Optimum Intel is not the same generated by this C++ sample: C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor` (uses `torch.Generator` inside). So, it's expected that image generated by Diffusers and C++ versions provide different images, because latent images are initialize differently. 
+ ## Download and convert the models and tokenizers The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -88,13 +92,6 @@ With adapter | Without adapter :---:|:---: ![](./lora.bmp) | ![](./baseline.bmp) - -## Note - -- Image generated with HuggingFace / Optimum Intel is not the same generated by this C++ sample: - -C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor`. So, it's expected that image generated by Python and C++ versions provide different images, because latent images are initialize differently. Users can implement their own random generator derived from `ov::genai::Generator` and pass it to `Text2ImagePipeline::generate` method. - ## Run text to image with multiple devices The `heterogeneous_stable_diffusion` sample demonstrates how a Text2ImagePipeline object can be created from individual subcomponents - scheduler, text encoder, unet, & vae decoder. This approach gives fine-grained control over the devices used to execute each stage of the stable diffusion pipeline. diff --git a/samples/cpp/image_generation/lora_text2image.cpp b/samples/cpp/image_generation/lora_text2image.cpp index 3fe4b74ff6..c1e6461db9 100644 --- a/samples/cpp/image_generation/lora_text2image.cpp +++ b/samples/cpp/image_generation/lora_text2image.cpp @@ -24,19 +24,19 @@ int32_t main(int32_t argc, char* argv[]) try { std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; ov::Tensor image = pipe.generate(prompt, - ov::genai::generator(std::make_shared(42)), ov::genai::width(512), ov::genai::height(896), - ov::genai::num_inference_steps(20)); + ov::genai::num_inference_steps(20), + ov::genai::rng_seed(42)); imwrite("lora.bmp", image, true); std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; image = pipe.generate(prompt, ov::genai::adapters(), // passing adapters in generate overrides adapters set in the constructor; adapters() means no adapters - ov::genai::generator(std::make_shared(42)), ov::genai::width(512), ov::genai::height(896), - ov::genai::num_inference_steps(20)); + ov::genai::num_inference_steps(20), + ov::genai::rng_seed(42)); imwrite("baseline.bmp", image, true); return EXIT_SUCCESS; diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md index 0ddf57d882..3e53f40fc4 100644 --- a/samples/python/image_generation/README.md +++ b/samples/python/image_generation/README.md @@ -20,6 +20,10 @@ Users can change the sample code and play with the following generation paramete - Apply multiple different LoRA adapters and mix them with different blending coefficients - (Image to image and inpainting) Play with `strength` parameter to control how initial image is noised and reduce number of inference steps +> [!NOTE] +> OpenVINO GenAI is written in C++ and uses `CppStdGenerator` random generator in Image Generation pipelines, while Diffusers library uses `torch.Generator` underhood. +> To have the same results with HuggingFace, pass manually created `torch.Generator(device='cpu').manual_seed(seed)` to Diffusers generation pipelines and `openvino_genai.TorchGenerator(seed)` to OpenVINO GenAI pipelines as value for `generator` kwarg. + ## Download and convert the models and tokenizers The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. 
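To make the seeding note above concrete, here is a minimal sketch of driving both libraries from the same seed. The prompt, the Hugging Face model id and the converted model directory are placeholders, and the Diffusers call assumes a Stable Diffusion checkpoint that `DiffusionPipeline.from_pretrained` can load; with both pipelines fed seed 42, the initial latents should match.

```python
import torch
import openvino_genai
from diffusers import DiffusionPipeline

prompt = "cyberpunk cityscape at dusk, cinematic lighting"  # placeholder prompt
seed = 42

# Diffusers: seed through a CPU torch.Generator
hf_pipe = DiffusionPipeline.from_pretrained("dreamlike-art/dreamlike-anime-1.0")  # placeholder model id
hf_image = hf_pipe(prompt, generator=torch.Generator(device="cpu").manual_seed(seed)).images[0]

# OpenVINO GenAI: seed through TorchGenerator so the same initial latents should be drawn
ov_pipe = openvino_genai.Text2ImagePipeline("./dreamlike_anime_1_0_ov/FP16", "CPU")  # placeholder model dir
ov_image = ov_pipe.generate(prompt, generator=openvino_genai.TorchGenerator(seed))
```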
@@ -41,7 +45,7 @@ Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pi Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` - ![](./text2image.bmp) + ![](./../../cpp/image_generation/512x512.bmp) ### Run with callback @@ -85,7 +89,7 @@ Check the difference: With adapter | Without adapter :---:|:---: -![](./lora.bmp) | ![](./baseline.bmp) +![](./../../cpp/image_generation/lora.bmp) | ![](./../../cpp/image_generation/baseline.bmp) ## Run text to image with multiple devices diff --git a/samples/python/image_generation/baseline.bmp b/samples/python/image_generation/baseline.bmp deleted file mode 100644 index 1501f5960e..0000000000 --- a/samples/python/image_generation/baseline.bmp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea0b60b64c4448448140a3cfb5e8609248ad35abd484ace1467d832e6966c941 -size 1376310 diff --git a/samples/python/image_generation/heterogeneous_stable_diffusion.py b/samples/python/image_generation/heterogeneous_stable_diffusion.py index b1a2f9d5de..18f150816e 100644 --- a/samples/python/image_generation/heterogeneous_stable_diffusion.py +++ b/samples/python/image_generation/heterogeneous_stable_diffusion.py @@ -101,8 +101,7 @@ def main(): height=height, guidance_scale=guidance_scale, num_inference_steps=number_of_inference_steps_per_image, - num_images_per_prompt=1, - generator=openvino_genai.CppStdGenerator(42) + num_images_per_prompt=1 ) image = Image.fromarray(image_tensor.data[0]) diff --git a/samples/python/image_generation/lora.bmp b/samples/python/image_generation/lora.bmp deleted file mode 100644 index a0aaedb930..0000000000 --- a/samples/python/image_generation/lora.bmp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:804bb8d49f1702422abf57c300af75fe75acbef60a9cf8ad5cfc9262b7532c95 -size 1376310 diff --git a/samples/python/image_generation/lora_text2image.py b/samples/python/image_generation/lora_text2image.py index 95e31ca0ea..6a46099dc2 100644 --- a/samples/python/image_generation/lora_text2image.py +++ b/samples/python/image_generation/lora_text2image.py @@ -6,20 +6,6 @@ import openvino as ov import openvino_genai -import numpy as np -import sys - - -class Generator(openvino_genai.Generator): - def __init__(self, seed, mu=0.0, sigma=1.0): - openvino_genai.Generator.__init__(self) - np.random.seed(seed) - self.mu = mu - self.sigma = sigma - - def next(self): - return np.random.normal(self.mu, self.sigma) - def image_write(path: str, image_tensor: ov.Tensor): from PIL import Image @@ -46,23 +32,23 @@ def main(): # LoRA adapters passed to the constructor will be activated by default in next generates pipe = openvino_genai.Text2ImagePipeline(args.models_path, device, adapters=adapter_config) + print("Generating image with LoRA adapters applied, resulting image will be in lora.bmp") image = pipe.generate(prompt, - generator=Generator(42), width=512, height=896, - num_inference_steps=20) + num_inference_steps=20, + rng_seed=42) image_write("lora.bmp", image) print("Generating image without LoRA adapters applied, resulting image will be in baseline.bmp") image = pipe.generate(prompt, # passing adapters in generate overrides adapters set in the constructor; openvino_genai.AdapterConfig() means no adapters adapters=openvino_genai.AdapterConfig(), - generator=Generator(42), width=512, height=896, - num_inference_steps=20 - ) + num_inference_steps=20, + rng_seed=42) image_write("baseline.bmp", image) diff --git 
a/samples/python/image_generation/text2image.bmp b/samples/python/image_generation/text2image.bmp deleted file mode 100644 index 54974556a4..0000000000 --- a/samples/python/image_generation/text2image.bmp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c150896ec84f64d4f0cacd67f8f277e08d3ebb1c9a756d43fc80944db7a2ed4 -size 786486 diff --git a/samples/python/image_generation/text2image.py b/samples/python/image_generation/text2image.py index 95d8c68e82..cba1eefd1d 100644 --- a/samples/python/image_generation/text2image.py +++ b/samples/python/image_generation/text2image.py @@ -6,17 +6,6 @@ import openvino_genai from PIL import Image -import numpy as np - -class Generator(openvino_genai.Generator): - def __init__(self, seed, mu=0.0, sigma=1.0): - openvino_genai.Generator.__init__(self) - np.random.seed(seed) - self.mu = mu - self.sigma = sigma - - def next(self): - return np.random.normal(self.mu, self.sigma) def main(): @@ -33,9 +22,7 @@ def main(): width=512, height=512, num_inference_steps=20, - num_images_per_prompt=1, - generator=Generator(42) # openvino_genai.CppStdGenerator can be used to have same images as C++ sample - ) + num_images_per_prompt=1) image = Image.fromarray(image_tensor.data[0]) image.save("image.bmp") diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 2402f57fba..9d79240aa8 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -67,9 +67,9 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER }; * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. - * @param presence_penalty reduces absolute log prob if the token was generated at least once. Ignored for non continuous batching. - * @param frequency_penalty reduces absolute log prob as many times as the token was generated. Ignored for non continuous batching. - * @param rng_seed initializes random generator. Ignored for non continuous batching. + * @param presence_penalty reduces absolute log prob if the token was generated at least once. + * @param frequency_penalty reduces absolute log prob as many times as the token was generated. + * @param rng_seed initializes random generator. * * Speculative decoding parameters: * @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of static strategy candidates number update. 
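As a rough illustration of the relaxed wording above (the seed is no longer tied to continuous batching), the sketch below assumes that `rng_seed`, like other `GenerationConfig` fields, can be passed as a keyword argument to `generate()` through the Python API; the model directory and prompt are placeholders.

```python
import openvino_genai

pipe = openvino_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0-ov", "CPU")  # placeholder model dir

# With multinomial sampling enabled, reusing the same rng_seed should reproduce the output.
out_1 = pipe.generate("Write a haiku about winter", do_sample=True, top_p=0.9,
                      max_new_tokens=30, rng_seed=42)
out_2 = pipe.generate("Write a haiku about winter", do_sample=True, top_p=0.9,
                      max_new_tokens=30, rng_seed=42)
print(str(out_1) == str(out_2))  # expected to print True on the same build and device
```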
@@ -174,7 +174,7 @@ static constexpr ov::Property repetition_penalty{"repetition_penalty"}; static constexpr ov::Property eos_token_id{"eos_token_id"}; static constexpr ov::Property presence_penalty{"presence_penalty"}; static constexpr ov::Property frequency_penalty{"frequency_penalty"}; -static constexpr ov::Property rng_seed{"rng_seed"}; +extern OPENVINO_GENAI_EXPORTS ov::Property rng_seed; static constexpr ov::Property assistant_confidence_threshold{"assistant_confidence_threshold"}; static constexpr ov::Property num_assistant_tokens{"num_assistant_tokens"}; diff --git a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp index 50e576466d..bd7073520a 100644 --- a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp +++ b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp @@ -39,6 +39,12 @@ class OPENVINO_GENAI_EXPORTS Generator { */ virtual ov::Tensor randn_tensor(const ov::Shape& shape); + /** + * Sets a new initial seed value to random generator + * @param new_seed A new seed value + */ + virtual void seed(size_t new_seed) = 0; + /** * Default dtor defined to ensure working RTTI. */ @@ -58,9 +64,11 @@ class OPENVINO_GENAI_EXPORTS CppStdGenerator : public Generator { virtual float next() override; + virtual void seed(size_t new_seed) override; + private: - std::mt19937 gen; - std::normal_distribution normal; + std::mt19937 m_gen; + std::normal_distribution m_normal; }; /** @@ -81,9 +89,17 @@ struct OPENVINO_GENAI_EXPORTS ImageGenerationConfig { size_t num_images_per_prompt = 1; /** - * Random generator to initial latents, add noise to initial images in case of image to image / inpainting pipelines + * Random generator to initialize latents, add noise to initial images in case of image to image / inpainting pipelines + * By default, random generator is initialized as `CppStdGenerator(generation_config.rng_seed)` + * @note If `generator` is specified, it has higher priority than `rng_seed` parameter. + */ + std::shared_ptr generator = nullptr; + + /** + * Seed for random generator + * @note If `generator` is specified, it has higher priority than `rng_seed` parameter. */ - std::shared_ptr generator = std::make_shared(42); + size_t rng_seed = 42; float guidance_scale = 7.5f; int64_t height = -1; @@ -91,7 +107,7 @@ struct OPENVINO_GENAI_EXPORTS ImageGenerationConfig { size_t num_inference_steps = 50; /** - * Max sequence lenght for T4 encoder / tokenizer used in SD3 / FLUX models + * Max sequence length for T5 encoder / tokenizer used in SD3 / FLUX models */ int max_sequence_length = -1; @@ -203,6 +219,12 @@ static constexpr ov::Property strength{"strength"}; */ static constexpr ov::Property> generator{"generator"}; +/** + * Seed for random generator + * @note If `generator` is specified, it has higher priority than `rng_seed` parameter. + */ +extern OPENVINO_GENAI_EXPORTS ov::Property rng_seed; + /** * This parameters limits max sequence length for T5 encoder for SD3 and FLUX models. * T5 tokenizer output is padded with pad tokens to 'max_sequence_length' within a pipeline. 
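A minimal sketch of how the two knobs documented above are expected to interact from the Python API, following the stated priority rule; the model directory and prompts are placeholders.

```python
import openvino_genai

pipe = openvino_genai.Text2ImagePipeline("./stable_diffusion_ov", "CPU")  # placeholder model dir

# Seeding via rng_seed: the pipeline is expected to build CppStdGenerator(42) internally.
image_a = pipe.generate("a cozy cabin in the woods", rng_seed=42)

# Seeding via an explicit generator object should be equivalent to the call above.
image_b = pipe.generate("a cozy cabin in the woods",
                        generator=openvino_genai.CppStdGenerator(42))

# When both are passed, the explicit generator is expected to take precedence,
# so this run should follow seed 7, not 42.
image_c = pipe.generate("a cozy cabin in the woods",
                        generator=openvino_genai.CppStdGenerator(7),
                        rng_seed=42)
```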
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 0829e8376a..189cfeded7 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -14,6 +14,8 @@ namespace ov { namespace genai { +ov::Property rng_seed{"rng_seed"}; + GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) { using utils::read_json_param; @@ -21,7 +23,7 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) { OPENVINO_ASSERT(f.is_open(), "Failed to open '", json_path, "' with generation config"); nlohmann::json data = nlohmann::json::parse(f); - + read_json_param(data, "max_new_tokens", max_new_tokens); read_json_param(data, "max_length", max_length); // note that ignore_eos is not present in HF GenerationConfig @@ -103,6 +105,9 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) { read_anymap_param(config_map, "echo", echo); read_anymap_param(config_map, "logprobs", logprobs); read_anymap_param(config_map, "adapters", adapters); + + // TODO: add support of 'generator' property similar to Image generation + read_anymap_param(config_map, "rng_seed", rng_seed); } size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const { diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index 716ba6b61b..e74cd441ce 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -4,7 +4,6 @@ #pragma once #include -#include #include "image_generation/diffusion_pipeline.hpp" #include "image_generation/numpy_utils.hpp" diff --git a/src/cpp/src/image_generation/generation_config.cpp b/src/cpp/src/image_generation/generation_config.cpp index 938034f628..ab098fabe5 100644 --- a/src/cpp/src/image_generation/generation_config.cpp +++ b/src/cpp/src/image_generation/generation_config.cpp @@ -27,11 +27,15 @@ ov::Tensor Generator::randn_tensor(const ov::Shape& shape) { } CppStdGenerator::CppStdGenerator(uint32_t seed) - : gen(seed), normal(0.0f, 1.0f) { + : m_gen(seed), m_normal(0.0f, 1.0f) { } float CppStdGenerator::next() { - return normal(gen); + return m_normal(m_gen); +} + +void CppStdGenerator::seed(size_t new_seed) { + m_gen.seed(new_seed); } // @@ -55,7 +59,6 @@ void ImageGenerationConfig::update_generation_config(const ov::AnyMap& propertie read_anymap_param(properties, "negative_prompt_2", negative_prompt_2); read_anymap_param(properties, "negative_prompt_3", negative_prompt_3); read_anymap_param(properties, "num_images_per_prompt", num_images_per_prompt); - read_anymap_param(properties, "generator", generator); read_anymap_param(properties, "guidance_scale", guidance_scale); read_anymap_param(properties, "height", height); read_anymap_param(properties, "width", width); @@ -64,6 +67,25 @@ void ImageGenerationConfig::update_generation_config(const ov::AnyMap& propertie read_anymap_param(properties, "adapters", adapters); read_anymap_param(properties, "max_sequence_length", max_sequence_length); + // 'generator' has higher priority than 'seed' parameter + const bool have_generator_param = properties.find(ov::genai::generator.name()) != properties.end(); + if (have_generator_param) { + read_anymap_param(properties, "generator", generator); + } else { + read_anymap_param(properties, "rng_seed", rng_seed); + + // initialize random generator with a given seed value + if (!generator) { + generator = std::make_shared(rng_seed); + } + + const bool have_rng_seed = 
properties.find(ov::genai::rng_seed.name()) != properties.end(); + if (have_rng_seed) { + // we need to change seed as an user have specified it manually + generator->seed(rng_seed); + } + } + validate(); } diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp index 18a3e0346f..e3e720109d 100644 --- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp @@ -4,7 +4,6 @@ #pragma once #include -#include #include "image_generation/diffusion_pipeline.hpp" #include "image_generation/numpy_utils.hpp" @@ -453,11 +452,6 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { check_inputs(generation_config, initial_image); - if (generation_config.generator == nullptr) { - uint32_t seed = time(NULL); - generation_config.generator = std::make_shared(seed); - } - // 3. Prepare timesteps m_scheduler->set_timesteps(generation_config.num_inference_steps, generation_config.strength); std::vector timesteps = m_scheduler->get_float_timesteps(); diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp index 4afbd3ac78..7549b67919 100644 --- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp @@ -3,7 +3,6 @@ #pragma once -#include #include #include @@ -333,11 +332,6 @@ class StableDiffusionPipeline : public DiffusionPipeline { set_lora_adapters(generation_config.adapters); - if (generation_config.generator == nullptr) { - uint32_t seed = time(NULL); - generation_config.generator = std::make_shared(seed); - } - m_scheduler->set_timesteps(generation_config.num_inference_steps, generation_config.strength); std::vector timesteps = m_scheduler->get_timesteps(); diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index ca7c2c0b32..470ddd0cd8 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -11,7 +11,6 @@ if hasattr(os, "add_dll_directory"): os.add_dll_directory(os.path.dirname(__file__)) - from .py_openvino_genai import ( DecodedResults, EncodedResults, @@ -75,6 +74,7 @@ ImageGenerationConfig, Generator, CppStdGenerator, + TorchGenerator, ) # Continuous batching diff --git a/src/python/openvino_genai/__init__.pyi b/src/python/openvino_genai/__init__.pyi index 4d74e17588..187e0a0a06 100644 --- a/src/python/openvino_genai/__init__.pyi +++ b/src/python/openvino_genai/__init__.pyi @@ -34,6 +34,7 @@ from openvino_genai.py_openvino_genai import T5EncoderModel from openvino_genai.py_openvino_genai import Text2ImagePipeline from openvino_genai.py_openvino_genai import TokenizedInputs from openvino_genai.py_openvino_genai import Tokenizer +from openvino_genai.py_openvino_genai import TorchGenerator from openvino_genai.py_openvino_genai import UNet2DConditionModel from openvino_genai.py_openvino_genai import VLMPipeline from openvino_genai.py_openvino_genai import WhisperGenerationConfig @@ -43,5 +44,5 @@ from openvino_genai.py_openvino_genai import WhisperRawPerfMetrics from openvino_genai.py_openvino_genai import draft_model import os as os from . 
import py_openvino_genai -__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'openvino', 'os', 'py_openvino_genai'] +__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'openvino', 'os', 'py_openvino_genai'] __version__: str = '2025.0.0.0' diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 829d4844e8..8b8eb76b12 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -5,7 +5,7 @@ from __future__ import annotations import openvino._pyopenvino import os import typing -__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model'] +__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 
'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model'] class Adapter: """ Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier. @@ -398,6 +398,8 @@ class CppStdGenerator(Generator): ... def randn_tensor(self, shape: openvino._pyopenvino.Shape) -> openvino._pyopenvino.Tensor: ... + def seed(self, new_seed: int) -> None: + ... class DecodedResults: """ @@ -804,7 +806,8 @@ class Image2ImagePipeline: height: int - height of resulting images, width: int - width of resulting images, num_inference_steps: int - number of inference steps, - generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator, + rng_seed: int - a seed for random numbers generator, + generator: openvino_genai.TorchGenerator, openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator, adapters: LoRA adapters, strength: strength for image to image generation. 1.0f means initial image is fully noised, max_sequence_length: int - length of t5_encoder_model input @@ -836,6 +839,7 @@ class ImageGenerationConfig: num_inference_steps: int prompt_2: str | None prompt_3: str | None + rng_seed: int strength: float width: int def __init__(self) -> None: @@ -903,7 +907,8 @@ class InpaintingPipeline: height: int - height of resulting images, width: int - width of resulting images, num_inference_steps: int - number of inference steps, - generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator, + rng_seed: int - a seed for random numbers generator, + generator: openvino_genai.TorchGenerator, openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator, adapters: LoRA adapters, strength: strength for image to image generation. 1.0f means initial image is fully noised, max_sequence_length: int - length of t5_encoder_model input @@ -1576,7 +1581,8 @@ class Text2ImagePipeline: height: int - height of resulting images, width: int - width of resulting images, num_inference_steps: int - number of inference steps, - generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator, + rng_seed: int - a seed for random numbers generator, + generator: openvino_genai.TorchGenerator, openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator, adapters: LoRA adapters, strength: strength for image to image generation. 1.0f means initial image is fully noised, max_sequence_length: int - length of t5_encoder_model input @@ -1649,6 +1655,18 @@ class Tokenizer: """ Override a chat_template read from tokenizer_config.json. """ +class TorchGenerator(CppStdGenerator): + """ + This class provides OpenVINO GenAI Generator wrapper for torch.Generator + """ + def __init__(self, seed: int) -> None: + ... + def next(self) -> float: + ... + def randn_tensor(self, shape: openvino._pyopenvino.Shape) -> openvino._pyopenvino.Tensor: + ... + def seed(self, new_seed: int) -> None: + ... 
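A short usage sketch for the stub above; the model directory and prompt are placeholders, and constructing `TorchGenerator` is expected to raise a `RuntimeError` when the `torch` package is not installed.

```python
import openvino_genai

pipe = openvino_genai.Text2ImagePipeline("./stable_diffusion_ov", "CPU")  # placeholder model dir
gen = openvino_genai.TorchGenerator(42)  # wraps torch.Generator; requires torch to be installed

image_tensor = pipe.generate("city skyline at night, watercolor",
                             generator=gen,
                             num_inference_steps=20)
```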
class UNet2DConditionModel: """ UNet2DConditionModel class. diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index 55be1708c1..da6ce6d21b 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "openvino/genai/image_generation/text2image_pipeline.hpp" #include "openvino/genai/image_generation/image2image_pipeline.hpp" @@ -19,23 +20,7 @@ namespace py = pybind11; namespace pyutils = ov::genai::pybind::utils; -namespace ov { -namespace genai { - -/// Trampoline class to support inheritance from Generator in Python -class PyGenerator : public ov::genai::Generator { -public: - float next() override { - PYBIND11_OVERRIDE_PURE(float, Generator, next); - } - - ov::Tensor randn_tensor(const ov::Shape& shape) override { - PYBIND11_OVERRIDE(ov::Tensor, Generator, randn_tensor, shape); - } -}; - -} // namespace genai -} // namespace ov +using namespace pybind11::literals; namespace { @@ -59,7 +44,8 @@ auto text2image_generate_docstring = R"( height: int - height of resulting images, width: int - width of resulting images, num_inference_steps: int - number of inference steps, - generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator, + rng_seed: int - a seed for random numbers generator, + generator: openvino_genai.TorchGenerator, openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator, adapters: LoRA adapters, strength: strength for image to image generation. 1.0f means initial image is fully noised, max_sequence_length: int - length of t5_encoder_model input @@ -68,7 +54,102 @@ auto text2image_generate_docstring = R"( :rtype: ov.Tensor )"; +// Trampoline class to support inheritance from Generator in Python +class PyGenerator : public ov::genai::Generator { +public: + float next() override { + PYBIND11_OVERRIDE_PURE(float, Generator, next); + } + + ov::Tensor randn_tensor(const ov::Shape& shape) override { + PYBIND11_OVERRIDE(ov::Tensor, Generator, randn_tensor, shape); + } + + void seed(size_t new_seed) override { + PYBIND11_OVERRIDE_PURE(void, Generator, seed, new_seed); + } +}; + +py::list to_py_list(const ov::Shape shape) { + py::list py_shape; + for (auto d : shape) + py_shape.append(d); + + return py_shape; +} +class TorchGenerator : public ov::genai::CppStdGenerator { + py::module_ m_torch; + py::object m_torch_generator, m_float32; + + void create_torch_generator(size_t seed) { + m_torch_generator = m_torch.attr("Generator")("device"_a="cpu").attr("manual_seed")(seed); + } +public: + explicit TorchGenerator(uint32_t seed) : CppStdGenerator(seed) { + try { + m_torch = py::module_::import("torch"); + } catch (const py::error_already_set& e) { + if (e.matches(PyExc_ModuleNotFoundError)) { + throw std::runtime_error("The 'torch' package is not installed. 
Please, call 'pip install torch' or use 'rng_seed' parameter."); + } else { + // Re-throw other exceptions + throw; + } + } + + m_float32 = m_torch.attr("float32"); + create_torch_generator(seed); + } + + float next() override { + return m_torch.attr("randn")(1, "generator"_a=m_torch_generator, "dtype"_a=m_float32).attr("item")().cast(); + } + + ov::Tensor randn_tensor(const ov::Shape& shape) override { + py::object torch_tensor = m_torch.attr("randn")(to_py_list(shape), "generator"_a=m_torch_generator, "dtype"_a=m_float32); + py::object numpy_tensor = torch_tensor.attr("numpy")(); + py::array numpy_array = py::cast(numpy_tensor); + + if (!numpy_array.dtype().is(py::dtype::of())) { + throw std::runtime_error("Expected a NumPy array with dtype float32"); + } + + class TorchTensorAllocator { + size_t m_total_size; + void * m_mutable_data; + py::object m_torch_tensor; // we need to hold torch.Tensor to avoid memory destruction + + public: + TorchTensorAllocator(size_t total_size, void * mutable_data, py::object torch_tensor) : + m_total_size(total_size), m_mutable_data(mutable_data), m_torch_tensor(torch_tensor) { } + + void* allocate(size_t bytes, size_t) const { + if (m_total_size == bytes) { + return m_mutable_data; + } + throw std::runtime_error{"Unexpected number of bytes was requested to allocate."}; + } + + void deallocate(void*, size_t bytes, size_t) { + if (m_total_size != bytes) { + throw std::runtime_error{"Unexpected number of bytes was requested to deallocate."}; + } + } + + bool is_equal(const TorchTensorAllocator& other) const noexcept { + return this == &other; + } + }; + + return ov::Tensor(ov::element::f32, shape, + TorchTensorAllocator(ov::shape_size(shape) * ov::element::f32.size(), numpy_array.mutable_data(), torch_tensor)); + } + + void seed(size_t new_seed) override { + create_torch_generator(new_seed); + } +}; } // namespace @@ -81,16 +162,24 @@ void init_flux_transformer_2d_model(py::module_& m); void init_autoencoder_kl(py::module_& m); void init_image_generation_pipelines(py::module_& m) { - py::class_>(m, "Generator", "This class is used for storing pseudo-random generator.") + py::class_>(m, "Generator", "This class is used for storing pseudo-random generator.") .def(py::init<>()); py::class_>(m, "CppStdGenerator", "This class wraps std::mt19937 pseudo-random generator.") .def(py::init([](uint32_t seed) { return std::make_unique(seed); - }), - py::arg("seed")) + }), py::arg("seed")) .def("next", &ov::genai::CppStdGenerator::next) - .def("randn_tensor", &ov::genai::CppStdGenerator::randn_tensor, py::arg("shape")); + .def("randn_tensor", &ov::genai::CppStdGenerator::randn_tensor, py::arg("shape")) + .def("seed", &ov::genai::CppStdGenerator::seed, py::arg("new_seed")); + + py::class_<::TorchGenerator, ov::genai::CppStdGenerator, std::shared_ptr<::TorchGenerator>>(m, "TorchGenerator", "This class provides OpenVINO GenAI Generator wrapper for torch.Generator") + .def(py::init([](uint32_t seed) { + return std::make_unique<::TorchGenerator>(seed); + }), py::arg("seed")) + .def("next", &::TorchGenerator::next) + .def("randn_tensor", &::TorchGenerator::randn_tensor, py::arg("shape")) + .def("seed", &::TorchGenerator::seed, py::arg("new_seed")); // init image generation models init_clip_text_model(m); @@ -122,6 +211,7 @@ void init_image_generation_pipelines(py::module_& m) { .def_readwrite("negative_prompt_2", &ov::genai::ImageGenerationConfig::negative_prompt_2) .def_readwrite("negative_prompt_3", &ov::genai::ImageGenerationConfig::negative_prompt_3) 
.def_readwrite("generator", &ov::genai::ImageGenerationConfig::generator) + .def_readwrite("rng_seed", &ov::genai::ImageGenerationConfig::rng_seed) .def_readwrite("guidance_scale", &ov::genai::ImageGenerationConfig::guidance_scale) .def_readwrite("height", &ov::genai::ImageGenerationConfig::height) .def_readwrite("width", &ov::genai::ImageGenerationConfig::width) diff --git a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py index 1ff7ff5e21..0cced117e4 100644 --- a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py +++ b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py @@ -27,17 +27,6 @@ } -class Generator(openvino_genai.Generator): - def __init__(self, seed, rng, mu=0.0, sigma=1.0): - openvino_genai.Generator.__init__(self) - self.mu = mu - self.sigma = sigma - self.rng = rng - - def next(self): - return torch.randn(1, generator=self.rng, dtype=torch.float32).item() - - @register_evaluator("text-to-image") class Text2ImageEvaluator(BaseEvaluator): def __init__( @@ -171,7 +160,7 @@ def default_gen_image_fn(model, prompt, num_inference_steps, generator=None): model, prompt, self.num_inference_steps, - generator=Generator(self.seed, rng) if self.is_genai else rng + generator=openvino_genai.TorchGenerator(self.seed) if self.is_genai else rng ) image_path = os.path.join(image_dir, f"{i}.png") image.save(image_path) From 2a52e869e16b47a06b17b3f21428207a61c1e8ea Mon Sep 17 00:00:00 2001 From: Pawel Raasz Date: Mon, 16 Dec 2024 17:40:31 +0100 Subject: [PATCH 14/18] Fix test fails after PPP stops move tensor names (#1390) ### Description: - Fix the GENAI test after PPP don't move Node's name and tensor names. ### Related PRs: - openvinotoolkit/openvino_tokenizers#352 - openvinotoolkit/openvino#28069 --------- Signed-off-by: Raasz, Pawel --- .github/workflows/linux.yml | 2 +- .github/workflows/llm_bench-python.yml | 4 ++-- .github/workflows/stable_diffusion_1_5_cpp.yml | 8 ++++---- thirdparty/openvino_tokenizers | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 8d596aed56..0bb0c1af6e 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -52,7 +52,7 @@ jobs: with: platform: ubuntu22 commit_packages_to_provide: wheels - revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868 + revision: latest_available_commit - name: Clone docker tag from OpenVINO repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 8b022f27e0..f87cd76126 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -34,7 +34,7 @@ jobs: runs-on: aks-linux-2-cores-8gb container: image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0' - volumes: + volumes: - /mount:/mount - ${{ github.workspace }}:${{ github.workspace }} @@ -44,7 +44,7 @@ jobs: with: platform: ubuntu22 commit_packages_to_provide: wheels - revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868 + revision: latest_available_commit build: defaults: diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 497bfbff3e..34c5a0f87e 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -35,7 +35,7 @@ jobs: runs-on: aks-linux-2-cores-8gb container: image: 
'openvinogithubactions.azurecr.io/openvino_provider:0.1.0' - volumes: + volumes: - /mount:/mount - ${{ github.workspace }}:${{ github.workspace }} @@ -45,7 +45,7 @@ jobs: with: platform: ubuntu22 commit_packages_to_provide: wheels - revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868 + revision: latest_available_commit openvino_download_windows: name: Download OpenVINO for Windows @@ -61,7 +61,7 @@ jobs: runs-on: aks-linux-2-cores-8gb container: image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0' - volumes: + volumes: - /mount:/mount - ${{ github.workspace }}:${{ github.workspace }} @@ -71,7 +71,7 @@ jobs: with: platform: windows commit_packages_to_provide: wheels - revision: 747d0e7e105c9f2c9966a37861f95b1c7f886868 + revision: latest_available_commit stable_diffusion_1_5_cpp-linux: runs-on: ubuntu-22.04-8-cores diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 1da0d2c705..bcfd3eda25 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 1da0d2c705016ad3f04c160ac9338f06505a07c1 +Subproject commit bcfd3eda25ae3ec423502a4074e35c774506c732 From a651292a803f184b9de957a44252c41f364d68ab Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Tue, 17 Dec 2024 04:47:36 +0100 Subject: [PATCH 15/18] Fix optimum-cli command for VLM example in README (#1348) With the existing command users get an error: Channel size 4304 should be divisible by size of group 128. --------- Co-authored-by: Alexander Kozlov Co-authored-by: Nikita Savelyev Co-authored-by: Ilya Lavrenov --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 680bc9bc65..c2509528c3 100644 --- a/README.md +++ b/README.md @@ -107,12 +107,12 @@ For more examples check out our [Generative AI workflow](https://docs.openvino.a ### Converting and compressing the model from Hugging Face library -```sh -#(Basic) download and convert to OpenVINO MiniCPM-V-2_6 model -optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format fp16 MiniCPM-V-2_6 +To convert the [OpenGVLab/InternVL2-1B](https://huggingface.co/OpenGVLab/InternVL2-1B) model, `timm` and `einops` are required: `pip install timm einops`. 
-#(Recommended) Same as above but with compression: language model is compressed to int4, other model components are compressed to int8 -optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format int4 MiniCPM-V-2_6 +```sh +# Download and convert the OpenGVLab/InternVL2-1B model to OpenVINO with int4 weight-compression for the language model +# Other components are compressed to int8 +optimum-cli export openvino -m OpenGVLab/InternVL2-1B --trust-remote-code --weight-format int4 InternVL2-1B ``` ### Run generation using VLMPipeline API in Python @@ -132,7 +132,7 @@ import openvino_genai as ov_genai from PIL import Image # Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU -pipe = ov_genai.VLMPipeline("./MiniCPM-V-2_6/", "CPU") +pipe = ov_genai.VLMPipeline("./InternVL2-1B", "CPU") image = Image.open("dog.jpg") image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8) From 1d4b1039a95c9f8817f412248656a83d463d3376 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 17 Dec 2024 10:48:02 +0400 Subject: [PATCH 16/18] [llm_bench] enable text2img callback only if supported (#1392) CVS-159282 --- tools/llm_bench/task/image_generation.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tools/llm_bench/task/image_generation.py b/tools/llm_bench/task/image_generation.py index f227898ef6..b870c7ec98 100644 --- a/tools/llm_bench/task/image_generation.py +++ b/tools/llm_bench/task/image_generation.py @@ -25,7 +25,7 @@ stable_diffusion_hook = StableDiffusionHook() -def collects_input_args(image_param, model_type, model_name): +def collects_input_args(image_param, model_type, model_name, callback=None): input_args = {} input_args["width"] = image_param.get('width', DEFAULT_IMAGE_WIDTH) input_args["height"] = image_param.get('height', DEFAULT_IMAGE_HEIGHT) @@ -37,6 +37,19 @@ def collects_input_args(image_param, model_type, model_name): else: if 'turbo' in model_name: input_args["guidance_scale"] = 0.0 + if callback is not None: + from openvino import get_version + from packaging.version import parse + + version = get_version() + # avoid invalid format + if "-" in version: + ov_major_version, dev_info = version.split("-", 1) + commit_id = dev_info.split("-")[0] + version = f"{ov_major_version}-{commit_id}" + is_callback_supported = parse(version) >= parse("2025.0.0") + if is_callback_supported: + input_args["callback"] = callback return input_args @@ -107,7 +120,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None): set_seed(args['seed']) input_text = image_param['prompt'] - input_args = collects_input_args(image_param, args['model_type'], args['model_name']) + input_args = collects_input_args(image_param, args['model_type'], args['model_name'], callback) out_str = f"Input params: Batch_size={args['batch_size']}, " \ f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}" if 'guidance_scale' in input_args: @@ -127,7 +140,7 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data llm_bench_utils.output_file.output_image_input_text(in_text, args, image_id, bs_idx, proc_id) callback.reset() start = time.perf_counter() - res = pipe.generate(input_text, **input_args, callback=callback).data + res = 
pipe.generate(input_text, **input_args).data end = time.perf_counter() callback.duration = end - start if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: @@ -157,7 +170,7 @@ def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, - stable_diffusion=callback, + stable_diffusion=callback if "callback" in input_args else None, prompt_idx=image_id ) metrics_print.print_generated(num, warm_up=(num == 0), generated=rslt_img_fn, prompt_idx=image_id) From f177ffc9799ef34a57b257e1811a60c68c167eb2 Mon Sep 17 00:00:00 2001 From: Anna Likholat Date: Tue, 17 Dec 2024 09:45:41 +0100 Subject: [PATCH 17/18] [ImageGeneration] PNDMScheduler support (#1393) ![image](https://github.com/user-attachments/assets/3ca9c44b-ec2e-49ae-afba-2e56d5bf51f7) ![image](https://github.com/user-attachments/assets/8999eac8-6acb-41ec-85f1-d6dab910aa44) ![image](https://github.com/user-attachments/assets/ee8e1461-5953-4c64-8c01-340cf6c3916b) ![image](https://github.com/user-attachments/assets/5ad73a32-ef2a-479d-b7ee-37543fd8d235) --- .../genai/image_generation/scheduler.hpp | 3 +- .../src/image_generation/schedulers/pndm.cpp | 277 ++++++++++++++++++ .../src/image_generation/schedulers/pndm.hpp | 67 +++++ .../image_generation/schedulers/scheduler.cpp | 3 + .../src/image_generation/schedulers/types.cpp | 2 + src/docs/SUPPORTED_MODELS.md | 19 ++ 6 files changed, 370 insertions(+), 1 deletion(-) create mode 100644 src/cpp/src/image_generation/schedulers/pndm.cpp create mode 100644 src/cpp/src/image_generation/schedulers/pndm.hpp diff --git a/src/cpp/include/openvino/genai/image_generation/scheduler.hpp b/src/cpp/include/openvino/genai/image_generation/scheduler.hpp index 9b038ccd56..21c266aa50 100644 --- a/src/cpp/include/openvino/genai/image_generation/scheduler.hpp +++ b/src/cpp/include/openvino/genai/image_generation/scheduler.hpp @@ -18,7 +18,8 @@ class OPENVINO_GENAI_EXPORTS Scheduler { LMS_DISCRETE, DDIM, EULER_DISCRETE, - FLOW_MATCH_EULER_DISCRETE + FLOW_MATCH_EULER_DISCRETE, + PNDM }; static std::shared_ptr from_config(const std::filesystem::path& scheduler_config_path, diff --git a/src/cpp/src/image_generation/schedulers/pndm.cpp b/src/cpp/src/image_generation/schedulers/pndm.cpp new file mode 100644 index 0000000000..a760283b97 --- /dev/null +++ b/src/cpp/src/image_generation/schedulers/pndm.cpp @@ -0,0 +1,277 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include + +#include "image_generation/schedulers/pndm.hpp" +#include "image_generation/numpy_utils.hpp" + +namespace ov { +namespace genai { + +PNDMScheduler::Config::Config(const std::filesystem::path& scheduler_config_path) { + std::ifstream file(scheduler_config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "num_train_timesteps", num_train_timesteps); + read_json_param(data, "beta_start", beta_start); + read_json_param(data, "beta_end", beta_end); + read_json_param(data, "beta_schedule", beta_schedule); + read_json_param(data, "trained_betas", trained_betas); + read_json_param(data, "set_alpha_to_one", set_alpha_to_one); + read_json_param(data, "skip_prk_steps", skip_prk_steps); + read_json_param(data, "steps_offset", steps_offset); + read_json_param(data, "prediction_type", 
prediction_type); + read_json_param(data, "timestep_spacing", timestep_spacing); +} + +PNDMScheduler::PNDMScheduler(const std::filesystem::path& scheduler_config_path) + : PNDMScheduler(Config(scheduler_config_path)) { +} + +PNDMScheduler::PNDMScheduler(const Config& scheduler_config): m_config(scheduler_config) { + + std::vector alphas, betas; + + using numpy_utils::linspace; + + if (!m_config.trained_betas.empty()) { + betas = m_config.trained_betas; + } else if (m_config.beta_schedule == BetaSchedule::LINEAR) { + betas = linspace(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps); + } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) { + float start = std::sqrt(m_config.beta_start); + float end = std::sqrt(m_config.beta_end); + betas = linspace(start, end, m_config.num_train_timesteps); + std::for_each(betas.begin(), betas.end(), [] (float & x) { x *= x; }); + // TODO: elif beta_schedule == "squaredcos_cap_v2": + } else { + OPENVINO_THROW("'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. Please, add support of other types"); + } + + std::transform(betas.begin(), betas.end(), std::back_inserter(alphas), [] (float b) { return 1.0f - b; }); + + for (size_t i = 1; i <= alphas.size(); i++) { + float alpha_cumprod = + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies{}); + m_alphas_cumprod.push_back(alpha_cumprod); + } + + m_final_alpha_cumprod = m_config.set_alpha_to_one ? 1 : m_alphas_cumprod[0]; + + // running values + m_ets = {}; + m_counter = 0; + + // setable values + m_num_inference_steps = -1; + m_prk_timesteps = {}; + m_plms_timesteps = {}; + m_timesteps = {}; +} + +void PNDMScheduler::set_timesteps(size_t num_inference_steps, float strength) { + m_timesteps.clear(), m_prk_timesteps.clear(), m_plms_timesteps.clear(); + + OPENVINO_ASSERT(num_inference_steps <= m_config.num_train_timesteps, + "`num_inference_steps` cannot be larger than `m_config.num_train_timesteps`"); + + m_num_inference_steps = num_inference_steps; + + switch (m_config.timestep_spacing) { + case TimestepSpacing::LINSPACE: + { + using numpy_utils::linspace; + float end = static_cast(m_config.num_train_timesteps - 1); + auto linspaced = linspace(0.0f, end, num_inference_steps, true); + for (float val : linspaced) { + m_timesteps.push_back(static_cast(std::round(val))); + } + break; + } + case TimestepSpacing::LEADING: + { + size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps; + for (size_t i = 0; i < m_num_inference_steps; ++i) { + m_timesteps.push_back(i * step_ratio + m_config.steps_offset); + } + break; + } + case TimestepSpacing::TRAILING: + { + float step_ratio = static_cast(m_config.num_train_timesteps) / static_cast(m_num_inference_steps); + for (float i = m_config.num_train_timesteps; i > 0; i-=step_ratio){ + m_timesteps.push_back(static_cast(std::round(i)) - 1); + } + std::reverse(m_timesteps.begin(), m_timesteps.end()); + break; + } + default: + OPENVINO_THROW("Unsupported value for 'timestep_spacing'. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."); + } + + if (m_config.skip_prk_steps) { + m_prk_timesteps = {}; + std::copy(m_timesteps.begin(), m_timesteps.end() - 1, std::back_inserter(m_plms_timesteps)); + m_plms_timesteps.push_back(m_timesteps[m_timesteps.size() - 2]); + m_plms_timesteps.push_back(m_timesteps[m_timesteps.size() - 1]); + std::reverse(m_plms_timesteps.begin(), m_plms_timesteps.end()); + } else { + OPENVINO_THROW("'skip_prk_steps=false' case isn't supported. 
Please, add support."); + } + + m_timesteps = m_prk_timesteps; + m_timesteps.insert(m_timesteps.end(), m_plms_timesteps.begin(), m_plms_timesteps.end()); + + m_ets = {}; + m_counter = 0; + m_cur_sample = ov::Tensor(ov::element::f32, {}); +} + +std::map PNDMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) { + // noise_pred - model_output + // latents - sample + // inference_step + + if (m_counter < m_prk_timesteps.size() && !m_config.skip_prk_steps) { + OPENVINO_THROW("'skip_prk_steps=false' case isn't supported. Please, add support."); + } else { + return step_plms(noise_pred, latents, m_timesteps[inference_step]); + } +} + +std::map PNDMScheduler::step_plms(ov::Tensor model_output, ov::Tensor sample, size_t timestep) { + OPENVINO_ASSERT(m_num_inference_steps != -1, + "Number of inference steps isn't set, you need to run `set_timesteps` after creating the scheduler"); + + int prev_timestep = timestep - m_config.num_train_timesteps / m_num_inference_steps; + + if (m_counter != 1) { + if (m_ets.size() > 3) { + m_ets = std::vector(m_ets.end() - 3, m_ets.end()); + } + ov::Tensor ets_last(model_output.get_element_type(), model_output.get_shape()); + model_output.copy_to(ets_last); + m_ets.push_back(ets_last); + } else { + prev_timestep = timestep; + timestep = timestep + m_config.num_train_timesteps / m_num_inference_steps; + } + + float* model_output_data = model_output.data(); + + size_t m_ets_size = m_ets.size(); + + if (m_ets_size == 1 && m_counter == 0) { + m_cur_sample = ov::Tensor(sample.get_element_type(), sample.get_shape()); + sample.copy_to(m_cur_sample); + } else if (m_ets_size == 1 && m_counter == 1) { + const float* ets_data = m_ets[0].data(); + for (size_t i = 0; i < model_output.get_size(); ++i) { + model_output_data[i] = (model_output_data[i] + ets_data[i]) / 2.0f; + } + sample = ov::Tensor(m_cur_sample.get_element_type(), m_cur_sample.get_shape()); + m_cur_sample.copy_to(sample); + m_cur_sample = ov::Tensor(ov::element::f32, {}); + } else if (m_ets_size == 2) { + const float* ets_data_1 = m_ets[1].data(); + const float* ets_data_2 = m_ets[0].data(); + for (size_t i = 0; i < model_output.get_size(); ++i) { + model_output_data[i] = (3.0f * ets_data_1[i] - ets_data_2[i]) / 2.0f; + } + } else if (m_ets_size == 3) { + const float* ets_data_1 = m_ets[2].data(); + const float* ets_data_2 = m_ets[1].data(); + const float* ets_data_3 = m_ets[0].data(); + for (size_t i = 0; i < model_output.get_size(); ++i) { + model_output_data[i] = (23.0f * ets_data_1[i] - 16.0f * ets_data_2[i] + 5.0f * ets_data_3[i]) / 12.0f; + } + } else if (m_ets_size == 4) { + const float* ets_data_1 = m_ets[3].data(); + const float* ets_data_2 = m_ets[2].data(); + const float* ets_data_3 = m_ets[1].data(); + const float* ets_data_4 = m_ets[0].data(); + + for (size_t i = 0; i < model_output.get_size(); ++i) { + model_output_data[i] = (1.0f / 24.0f) + * (55.0f * ets_data_1[i] - 59.0f * ets_data_2[i] + 37.0f * ets_data_3[i] - 9.0f * ets_data_4[i]); + } + } else { + OPENVINO_THROW("PNDMScheduler: Unsupported step_plms case."); + } + + ov::Tensor prev_sample = get_prev_sample(sample, timestep, prev_timestep, model_output); + m_counter++; + + std::map result{{"latent", prev_sample}}; + return result; +} + +ov::Tensor PNDMScheduler::get_prev_sample(ov::Tensor sample, size_t timestep, int prev_timestep, ov::Tensor model_output) { + float alpha_prod_t = m_alphas_cumprod[timestep]; + float alpha_prod_t_prev = (prev_timestep >= 0) ? 
m_alphas_cumprod[prev_timestep] : m_final_alpha_cumprod; + float beta_prod_t = 1 - alpha_prod_t; + float beta_prod_t_prev = 1 - alpha_prod_t_prev; + + float sample_coeff = std::sqrt((alpha_prod_t_prev / alpha_prod_t)); + float model_output_denom_coeff = alpha_prod_t * std::sqrt(beta_prod_t_prev) + + std::sqrt((alpha_prod_t * beta_prod_t * alpha_prod_t_prev)); + + float* model_output_data = model_output.data(); + float* sample_data = sample.data(); + + switch (m_config.prediction_type) { + case PredictionType::EPSILON: + break; + case PredictionType::V_PREDICTION: + for (size_t i = 0; i < model_output.get_size(); ++i) { + model_output_data[i] = std::sqrt(alpha_prod_t) * model_output_data[i] + std::sqrt(beta_prod_t) * sample_data[i]; + } + break; + default: + OPENVINO_THROW("Unsupported value for 'PredictionType'"); + } + + ov::Tensor prev_sample = ov::Tensor(model_output.get_element_type(), model_output.get_shape()); + float* prev_sample_data = prev_sample.data(); + + for (size_t i = 0; i < prev_sample.get_size(); ++i) { + prev_sample_data[i] = sample_coeff * sample_data[i] - (alpha_prod_t_prev - alpha_prod_t) * model_output_data[i] / model_output_denom_coeff; + } + + return prev_sample; +} + +void PNDMScheduler::add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const { + float sqrt_alpha_prod = std::sqrt(m_alphas_cumprod[latent_timestep]); + float sqrt_one_minus_alpha_prod = std::sqrt(1.0 - m_alphas_cumprod[latent_timestep]); + + float * init_latent_data = init_latent.data(); + const float * noise_data = noise.data(); + + for (size_t i = 0; i < init_latent.get_size(); ++i) { + init_latent_data[i] = sqrt_alpha_prod * init_latent_data[i] + sqrt_one_minus_alpha_prod * noise_data[i]; + } +} + +std::vector PNDMScheduler::get_timesteps() const { + return m_timesteps; +} + +void PNDMScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { + return; +} + +float PNDMScheduler::get_init_noise_sigma() const { + return 1.0f; +} + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/image_generation/schedulers/pndm.hpp b/src/cpp/src/image_generation/schedulers/pndm.hpp new file mode 100644 index 0000000000..4e346f58b3 --- /dev/null +++ b/src/cpp/src/image_generation/schedulers/pndm.hpp @@ -0,0 +1,67 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +#include "image_generation/schedulers/types.hpp" +#include "image_generation/schedulers/ischeduler.hpp" + +namespace ov { +namespace genai { + +class PNDMScheduler : public IScheduler { +public: + struct Config { + int32_t num_train_timesteps = 1000; + float beta_start = 0.0001f, beta_end = 0.02f; + BetaSchedule beta_schedule = BetaSchedule::LINEAR; + std::vector trained_betas = {}; + bool set_alpha_to_one = false, skip_prk_steps = false; + PredictionType prediction_type = PredictionType::EPSILON; + TimestepSpacing timestep_spacing = TimestepSpacing::LEADING; + size_t steps_offset = 0; + + Config() = default; + explicit Config(const std::filesystem::path& scheduler_config_path); + }; + + explicit PNDMScheduler(const std::filesystem::path& scheduler_config_path); + explicit PNDMScheduler(const Config& scheduler_config); + + void set_timesteps(size_t num_inference_steps, float strength) override; + + std::vector get_timesteps() const override; + + float get_init_noise_sigma() const override; + + void scale_model_input(ov::Tensor sample, size_t inference_step) override; + + std::map step(ov::Tensor noise_pred, 
ov::Tensor latents, size_t inference_step, std::shared_ptr generator) override; + + void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t timestep) const override; + +private: + Config m_config; + + float m_final_alpha_cumprod; + size_t m_num_inference_steps; + size_t m_counter; + + std::vector m_alphas_cumprod; + std::vector m_timesteps; + std::vector m_prk_timesteps; + std::vector m_plms_timesteps; + std::vector m_ets; + + ov::Tensor m_cur_sample; + + std::map step_plms(ov::Tensor model_output, ov::Tensor sample, size_t timestep); + ov::Tensor get_prev_sample(ov::Tensor sample, size_t timestep, int prev_timestep, ov::Tensor model_output); +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/image_generation/schedulers/scheduler.cpp b/src/cpp/src/image_generation/schedulers/scheduler.cpp index 3a7556b6d9..f9cd098346 100644 --- a/src/cpp/src/image_generation/schedulers/scheduler.cpp +++ b/src/cpp/src/image_generation/schedulers/scheduler.cpp @@ -10,6 +10,7 @@ #include "image_generation/schedulers/ddim.hpp" #include "image_generation/schedulers/euler_discrete.hpp" #include "image_generation/schedulers/flow_match_euler_discrete.hpp" +#include "image_generation/schedulers/pndm.hpp" namespace ov { namespace genai { @@ -38,6 +39,8 @@ std::shared_ptr Scheduler::from_config(const std::filesystem::path& s scheduler = std::make_shared(scheduler_config_path); } else if (scheduler_type == Scheduler::Type::FLOW_MATCH_EULER_DISCRETE) { scheduler = std::make_shared(scheduler_config_path); + } else if (scheduler_type == Scheduler::Type::PNDM) { + scheduler = std::make_shared(scheduler_config_path); } else { OPENVINO_THROW("Unsupported scheduler type '", scheduler_type, ". Please, manually create scheduler via supported one"); } diff --git a/src/cpp/src/image_generation/schedulers/types.cpp b/src/cpp/src/image_generation/schedulers/types.cpp index f7d21b12af..2f7c6d3f25 100644 --- a/src/cpp/src/image_generation/schedulers/types.cpp +++ b/src/cpp/src/image_generation/schedulers/types.cpp @@ -55,6 +55,8 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Schedu param = Scheduler::EULER_DISCRETE; else if (scheduler_type_str == "FlowMatchEulerDiscreteScheduler") param = Scheduler::FLOW_MATCH_EULER_DISCRETE; + else if (scheduler_type_str == "PNDMScheduler") + param = Scheduler::PNDM; else if (!scheduler_type_str.empty()) { OPENVINO_THROW("Unsupported value for 'scheduler' ", scheduler_type_str); } diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index fe13e5848f..8c922ee644 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -183,10 +183,29 @@ The pipeline can work with other similar topologies produced by `optimum-intel` Supported From 973b26b2b1fed25b878ea6108b4d7c5ae825dc12 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 17 Dec 2024 13:20:32 +0400 Subject: [PATCH 18/18] add VLM support in llm bench (#1318) TO DO: - [x] add test - [x] check correctness of num_input_tokens after https://github.com/openvinotoolkit/openvino.genai/pull/1317 - [x] move unsupported pipelines to optimum --------- Co-authored-by: Andrei Kochin --- .github/workflows/llm_bench-python.yml | 6 +- tools/llm_bench/benchmark.py | 2 + tools/llm_bench/doc/PROMPT.md | 7 +- .../llm_bench/llm_bench_utils/config_class.py | 6 +- .../llm_bench_utils/gen_output_data.py | 2 + .../llm_bench_utils/hook_beam_search.py | 26 +- .../llm_bench/llm_bench_utils/hook_common.py | 2 + .../llm_bench_utils/hook_greedy_search.py | 30 +- 
.../llm_bench_utils/metrics_print.py | 2 + .../llm_bench/llm_bench_utils/model_utils.py | 74 ++-- tools/llm_bench/llm_bench_utils/ov_utils.py | 81 ++++ .../llm_bench_utils/parse_json_data.py | 17 + .../task/visual_language_generation.py | 366 ++++++++++++++++++ 13 files changed, 586 insertions(+), 35 deletions(-) create mode 100644 tools/llm_bench/task/visual_language_generation.py diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index f87cd76126..3d31649cea 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -137,12 +137,16 @@ jobs: optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny ./ov_models/whisper-tiny python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 --optimum python ./tools/llm_bench/benchmark.py -m ./ov_models/whisper-tiny --media multilingual_librispeech/data/mls_polish/train/audio/3283_1447_000/3283_1447_000000.flac -d cpu -n 1 + - name: Text InternVL2-1B on Linux + run: | + optimum-cli export openvino --model OpenGVLab/InternVL2-1B ./ov_models/internvl2-1B --task image-text-to-text --trust-remote-code + python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20 + python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --prompt "What is unusual on this image?" -ic 20 --optimum - name: WWB Tests run: | pip install git+https://github.com/huggingface/optimum-intel.git GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }} python -m pytest -v ${{ env.WWB_PATH }}/tests - stateful: defaults: run: diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py index bd5a5716a7..5fa22497c1 100644 --- a/tools/llm_bench/benchmark.py +++ b/tools/llm_bench/benchmark.py @@ -12,6 +12,7 @@ from llm_bench_utils.memory_profile import MemConsumption import llm_bench_utils.output_csv import llm_bench_utils.output_json +import task.visual_language_generation as bench_vlm import task.text_generation as bench_text import task.image_generation as bench_image import task.super_resolution_generation as bench_ldm_sr @@ -167,6 +168,7 @@ def get_argprser(): 'code_gen': bench_text.run_text_generation_benchmark, 'ldm_super_resolution': bench_ldm_sr.run_ldm_super_resolution_benchmark, 'speech2text': bench_speech.run_speech_2_txt_benchmark, + "vlm": bench_vlm.run_visual_language_generation_benchmark } diff --git a/tools/llm_bench/doc/PROMPT.md b/tools/llm_bench/doc/PROMPT.md index 4ee28d47fa..5418bf0bb5 100644 --- a/tools/llm_bench/doc/PROMPT.md +++ b/tools/llm_bench/doc/PROMPT.md @@ -36,4 +36,9 @@ Supported parameters that can be set are: * `timestamp` - timestamp for whisper (default true) Prompt file example: {"media": "./audio/intel_ad_90s_128kbps.mp3", "language": "<|en|>", "timestamp":false} -{"media": "./audio/intel_ad_120s_128kbps.mp3", "language": "<|en|>", "timestamp":true} \ No newline at end of file +{"media": "./audio/intel_ad_120s_128kbps.mp3", "language": "<|en|>", "timestamp":true} + +## 5. 
Visual Language Models +Supported parameters that can be set are: +* `media` - imge file path +* `prompt`- input text prompt \ No newline at end of file diff --git a/tools/llm_bench/llm_bench_utils/config_class.py b/tools/llm_bench/llm_bench_utils/config_class.py index 12385d2879..7dd27b198b 100644 --- a/tools/llm_bench/llm_bench_utils/config_class.py +++ b/tools/llm_bench/llm_bench_utils/config_class.py @@ -8,7 +8,8 @@ OVModelForCausalLM, OVModelForSeq2SeqLM, OVDiffusionPipeline, - OVModelForSpeechSeq2Seq + OVModelForSpeechSeq2Seq, + OVModelForVisualCausalLM ) from llm_bench_utils.ov_model_classes import OVMPTModel, OVLDMSuperResolutionPipeline, OVChatGLMModel @@ -36,6 +37,7 @@ 'chatglm3': OVModelForCausalLM, 'chatglm': OVChatGLMModel, 'whisper': OVModelForSpeechSeq2Seq, + "vlm": OVModelForVisualCausalLM, } PT_MODEL_CLASSES_MAPPING = { @@ -51,6 +53,7 @@ USE_CASES = { 'image_gen': ['stable-diffusion-', 'ssd-', 'tiny-sd', 'small-sd', 'lcm-', 'sdxl', 'dreamlike', "flux"], + "vlm": ["llava", "llava-next", "qwen2-vl", "llava-qwen2", "internvl-chat", "minicpmv", "phi3-v"], 'speech2text': ['whisper'], 'image_cls': ['vit'], 'code_gen': ['replit', 'codegen2', 'codegen', 'codet5', "stable-code"], @@ -111,4 +114,5 @@ 'speech2text': 'whisper', 'code_gen': 'decoder', 'ldm_super_resolution': 'ldm_super_resolution', + "vlm": "vlm" } diff --git a/tools/llm_bench/llm_bench_utils/gen_output_data.py b/tools/llm_bench/llm_bench_utils/gen_output_data.py index 3b7c668c82..b65e7b5c8c 100644 --- a/tools/llm_bench/llm_bench_utils/gen_output_data.py +++ b/tools/llm_bench/llm_bench_utils/gen_output_data.py @@ -16,6 +16,7 @@ def gen_iterate_data( max_uss_mem='', prompt_idx='', tokenization_time=[], + mm_embeddings_preparation_time='' ): iter_data = {} iter_data['iteration'] = iter_idx @@ -35,4 +36,5 @@ def gen_iterate_data( iter_data['prompt_idx'] = prompt_idx iter_data['tokenization_time'] = tokenization_time[0] if len(tokenization_time) > 0 else '' iter_data['detokenization_time'] = tokenization_time[1] if len(tokenization_time) > 1 else '' + iter_data["mm_embeddings_preparation_time"] = mm_embeddings_preparation_time return iter_data diff --git a/tools/llm_bench/llm_bench_utils/hook_beam_search.py b/tools/llm_bench/llm_bench_utils/hook_beam_search.py index 99b0a9e5c3..d933acc3a5 100644 --- a/tools/llm_bench/llm_bench_utils/hook_beam_search.py +++ b/tools/llm_bench/llm_bench_utils/hook_beam_search.py @@ -5,6 +5,7 @@ import time import torch import warnings +import types import logging as log from torch import nn from typing import Optional, Tuple, Union, List @@ -54,6 +55,7 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput): tm_list = [] tm_infer_list = [] +tm_mm_embeddings = [] # Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99 @@ -455,6 +457,15 @@ def new_beam_search( else: return sequence_outputs["sequences"] +def new_get_multimodal_embeddings( + self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, **kwargs + ): + + start = time.perf_counter() + result = self._orig_get_multimodal_embeddings(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, position_ids=position_ids, **kwargs) + end = time.perf_counter() + tm_mm_embeddings.append(end - start) + return result class BeamSearchHook: def __init__(self): @@ -483,6 +494,19 @@ def get_time_infer_list(self): global tm_infer_list return tm_infer_list + def get_mm_embeddings_time_list(self): + global tm_mm_embeddings + return tm_mm_embeddings + + def clear_mm_embeddins_time_list(self): + """Clear 
the infer time list.""" + global tm_mm_embeddings + tm_mm_embeddings.clear() + def new_forward(self, model): """Define a new beam search function.""" - model._beam_search = new_beam_search.__get__(model, model.__class__) \ No newline at end of file + model._beam_search = new_beam_search.__get__(model, model.__class__) + + def new_get_multimodal_embeddings(self, model): + model._orig_get_multimodal_embeddings = model.get_multimodal_embeddings + model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model) \ No newline at end of file diff --git a/tools/llm_bench/llm_bench_utils/hook_common.py b/tools/llm_bench/llm_bench_utils/hook_common.py index 4751ed7d4d..c805680cee 100644 --- a/tools/llm_bench/llm_bench_utils/hook_common.py +++ b/tools/llm_bench/llm_bench_utils/hook_common.py @@ -21,6 +21,8 @@ def get_bench_hook(num_beams, ov_model): else: bench_hook = llm_bench_utils.hook_greedy_search.GreedySearchHook() bench_hook.new_forward(ov_model) + if hasattr(ov_model, "get_multimodal_embeddings"): + bench_hook.new_get_multimodal_embeddings(ov_model) else: log.warning(f'The minimum version of transformers to get 1st and 2nd tokens latency of {search_type} is: {min_version}') bench_hook = None diff --git a/tools/llm_bench/llm_bench_utils/hook_greedy_search.py b/tools/llm_bench/llm_bench_utils/hook_greedy_search.py index 03bbd55ea4..9039a99e69 100644 --- a/tools/llm_bench/llm_bench_utils/hook_greedy_search.py +++ b/tools/llm_bench/llm_bench_utils/hook_greedy_search.py @@ -4,6 +4,7 @@ # flake8: noqa import time import torch +import types import warnings import logging as log import transformers @@ -50,7 +51,7 @@ class GenerateEncoderDecoderOutput(ModelOutput): tm_list = [] tm_infer_list = [] - +tm_mm_embeddings = [] # Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99 # Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2310 # Add the function of collecting latency @@ -328,6 +329,17 @@ def new_greedy_search( return input_ids +def new_get_multimodal_embeddings( + self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, **kwargs + ): + + start = time.perf_counter() + result = self._orig_get_multimodal_embeddings(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, position_ids=position_ids, **kwargs) + end = time.perf_counter() + tm_mm_embeddings.append(end - start) + return result + + class GreedySearchHook: def __init__(self): """Clear the time list.""" @@ -355,6 +367,16 @@ def get_time_infer_list(self): global tm_infer_list return tm_infer_list + + def get_mm_embeddings_time_list(self): + global tm_mm_embeddings + return tm_mm_embeddings + + def clear_mm_embeddins_time_list(self): + """Clear the infer time list.""" + global tm_mm_embeddings + tm_mm_embeddings.clear() + def new_forward(self, model): """Define a new greedy search function.""" model._greedy_search = new_greedy_search.__get__(model, model.__class__) @@ -363,4 +385,8 @@ def new_forward(self, model): if trans_version >= version.parse('4.45.0'): model._sample = hook_sample_v45.new_sample.__get__(model, model.__class__) elif trans_version >= version.parse('4.43.0'): - model._sample = hook_sample_v43.new_sample.__get__(model, model.__class__) + model._sample = hook_sample_v43.new_sample.__get__(model, model.__class__) + + def new_get_multimodal_embeddings(self, model): + model._orig_get_multimodal_embeddings = model.get_multimodal_embeddings + 
model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model) \ No newline at end of file diff --git a/tools/llm_bench/llm_bench_utils/metrics_print.py b/tools/llm_bench/llm_bench_utils/metrics_print.py index 73e83dc672..740d3b9bcc 100644 --- a/tools/llm_bench/llm_bench_utils/metrics_print.py +++ b/tools/llm_bench/llm_bench_utils/metrics_print.py @@ -26,6 +26,8 @@ def print_metrics( output_str += 'Tokenization Time: {:.2f}ms, '.format(tokenization_time[0]) if len(tokenization_time) > 1: output_str += 'Detokenization Time: {:.2f}ms, '.format(tokenization_time[1]) + if iter_data['mm_embeddings_preparation_time'] != '': + output_str += ' Multimodal Embeddings Preparation Time: {:.2f}ms, '.format(iter_data['mm_embeddings_preparation_time']) if iter_data['generation_time'] != '': output_str += 'Generation Time: {:.2f}s, '.format(iter_data['generation_time']) if iter_data['latency'] != '': diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py index f72557b6c5..f3e7d21777 100644 --- a/tools/llm_bench/llm_bench_utils/model_utils.py +++ b/tools/llm_bench/llm_bench_utils/model_utils.py @@ -13,38 +13,54 @@ def get_param_from_file(args, input_key): is_json_data = False data_list = [] - if args[input_key] is None and args['prompt_file'] is None: - if args['use_case'] == 'text_gen': - data_list.append('What is OpenVINO?') - elif args['use_case'] == 'code_gen': - data_list.append('def print_hello_world():') - elif args['use_case'] == 'image_gen': - data_list.append('sailing ship in storm by Leonardo da Vinci') - else: - raise RuntimeError(f'== {input_key} and prompt file is empty ==') - elif args[input_key] is not None and args['prompt_file'] is not None: - raise RuntimeError(f'== {input_key} and prompt file should not exist together ==') - else: - if args[input_key] is not None: - if args[input_key] != '': - data_list.append(args[input_key]) + if args['prompt_file'] is None: + if not isinstance(input_key, (list, tuple)): + if args[input_key] is None: + if args['use_case'] == 'text_gen': + data_list.append('What is OpenVINO?') + elif args['use_case'] == 'code_gen': + data_list.append('def print_hello_world():') + elif args['use_case'] == 'image_gen': + data_list.append('sailing ship in storm by Leonardo da Vinci') + else: + raise RuntimeError(f'== {input_key} and prompt file is empty ==') + + elif args[input_key] is not None and args['prompt_file'] is not None: + raise RuntimeError(f'== {input_key} and prompt file should not exist together ==') else: - raise RuntimeError(f'== {input_key} path should not be empty string ==') - else: - input_prompt_list = args['prompt_file'] - is_json_data = True - for input_prompt in input_prompt_list: - if input_prompt.endswith('.jsonl'): - if os.path.exists(input_prompt): - log.info(f'Read prompts from {input_prompt}') - with open(input_prompt, 'r', encoding='utf-8') as f: - for line in f: - data = json.loads(line) - data_list.append(data) + if args[input_key] is not None: + if args[input_key] != '': + data_list.append(args[input_key]) else: - raise RuntimeError(f'== The prompt file:{input_prompt} does not exist ==') + raise RuntimeError(f'== {input_key} path should not be empty string ==') + else: + if args["use_case"] != "vlm": + raise RuntimeError("Multiple sources for benchmarking supported only for Visual Language Models") + data_dict = {} + if args["media"] is None: + log.warn("Input image is not provided. 
Only text generation part will be evaluated") + else: + data_dict["media"] = args["media"] + if args["prompt"] is None: + data_dict["prompt"] = "What is OpenVINO?" if args["media"] is None else "Describe image" + else: + data_dict["prompt"] = args["prompt"] + data_list.append(data_dict) + else: + input_prompt_list = args['prompt_file'] + is_json_data = True + for input_prompt in input_prompt_list: + if input_prompt.endswith('.jsonl'): + if os.path.exists(input_prompt): + log.info(f'Read prompts from {input_prompt}') + with open(input_prompt, 'r', encoding='utf-8') as f: + for line in f: + data = json.loads(line) + data_list.append(data) else: - raise RuntimeError(f'== The prompt file:{input_prompt} should be ended with .jsonl ==') + raise RuntimeError(f'== The prompt file:{input_prompt} does not exist ==') + else: + raise RuntimeError(f'== The prompt file:{input_prompt} should be ended with .jsonl ==') return data_list, is_json_data diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py index 8a28fbe355..427f1c84f3 100644 --- a/tools/llm_bench/llm_bench_utils/ov_utils.py +++ b/tools/llm_bench/llm_bench_utils/ov_utils.py @@ -23,6 +23,8 @@ import queue from transformers.generation.streamers import BaseStreamer +GENAI_SUPPORTED_VLM = ["llava", "llava-next", "internvl-chat", "minicpmv"] + def generate_simplified(self, *args, **kwargs): if len(args): @@ -523,6 +525,85 @@ def create_speech_2txt_model(model_path, device, **kwargs): return pipe, processor, from_pretrained_time, False +def get_vlm_processor(model_path): + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model_type = config.model_type + if model_type == "llava-qwen2": + processor = AutoProcessor.from_pretrained(config.mm_vision_tower, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + preprocessors = {"processor": processor, "tokenizer": tokenizer} + elif model_type == "internvl_chat": + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + preprocessors = {"processor": None, "tokenizer": tokenizer, "config": config} + else: + processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) + preprocessors = {"processor": processor, "tokenizer": processor} + return preprocessors + + +def create_genai_image_text_gen_model(model_path, device, ov_config, **kwargs): + import openvino_genai + + if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists(): + convert_ov_tokenizer(model_path) + + processor_config = get_vlm_processor(model_path) + + start = time.perf_counter() + llm_pipe = openvino_genai.VLMPipeline(model_path, device.upper(), **ov_config) + end = time.perf_counter() + log.info(f'Pipeline initialization time: {end - start:.2f}s') + + return llm_pipe, processor_config, end - start, None, True + + +def create_image_text_gen_model(model_path, device, **kwargs): + model_path = Path(model_path) + # specify the model path + if model_path.name.endswith('xml'): + model_path = model_path.parents[2] + + ov_config = kwargs['config'] + + model_path_existed = Path(model_path).exists() + # load model + if not model_path_existed: + raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') + else: + remote_code = False + try: + model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=False) + except Exception: + model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + 
remote_code = True + if kwargs.get("genai", True) and is_genai_available(log_msg=True): + if model_config.model_type.replace("_", "-") in GENAI_SUPPORTED_VLM: + log.info("Selected OpenVINO GenAI for benchmarking") + return create_genai_image_text_gen_model(model_path, device, ov_config, **kwargs) + else: + log.warning( + f"Model type `{model_config.model_type}` is not supported by OpenVINO GenAI. " + "Benchmark will be switched to Optimum Intel pipeline realization" + ) + + log.info("Selected Optimum Intel for benchmarking") + model_class = OV_MODEL_CLASSES_MAPPING.get(DEFAULT_MODEL_CLASSES[kwargs['use_case']]) + start = time.perf_counter() + ov_model = model_class.from_pretrained( + model_path, + device=device, + ov_config=ov_config, + config=model_config, + trust_remote_code=remote_code + ) + end = time.perf_counter() + bench_hook = get_bench_hook(kwargs['num_beams'], ov_model) + from_pretrained_time = end - start + log.info(f'From pretrained time: {from_pretrained_time:.2f}s') + processor_config = get_vlm_processor(model_path) + return ov_model, processor_config, from_pretrained_time, bench_hook, False + + def is_genai_available(log_msg=False): import importlib try: diff --git a/tools/llm_bench/llm_bench_utils/parse_json_data.py b/tools/llm_bench/llm_bench_utils/parse_json_data.py index 6e2978d9d6..28fbd298cd 100644 --- a/tools/llm_bench/llm_bench_utils/parse_json_data.py +++ b/tools/llm_bench/llm_bench_utils/parse_json_data.py @@ -16,6 +16,23 @@ def parse_text_json_data(json_data_list): return text_param_list +def parse_vlm_json_data(json_data_list): + text_param_list = [] + for json_data in json_data_list: + prompt_data = {} + if 'prompt' in json_data: + if json_data['prompt'] != '': + prompt_data["prompt"] = json_data['prompt'] + else: + raise RuntimeError('== prompt should not be empty string ==') + else: + raise RuntimeError('== key word "prompt" does not exist ==') + if "media" in json_data_list: + prompt_data["media"] = json_data["media"] + text_param_list.append(prompt_data) + return text_param_list + + def parse_image_json_data(json_data_list): image_param_list = [] for data in json_data_list: diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py new file mode 100644 index 0000000000..c4144366b4 --- /dev/null +++ b/tools/llm_bench/task/visual_language_generation.py @@ -0,0 +1,366 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +import os +import time +import datetime +from pathlib import Path +import logging as log +import llm_bench_utils.ov_utils +import llm_bench_utils.pt_utils +import llm_bench_utils.model_utils as model_utils +import numpy as np +import openvino as ov +import hashlib +import llm_bench_utils.metrics_print as metrics_print +import llm_bench_utils.output_csv +from transformers import set_seed +from transformers.image_utils import load_image +import llm_bench_utils.output_json +import llm_bench_utils.output_file +import llm_bench_utils.gen_output_data as gen_output_data +import llm_bench_utils.parse_json_data as parse_json_data + +FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils} + +DEFAULT_OUTPUT_TOKEN_SIZE = 512 + + +def run_visual_language_generation_optimum( + inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index, bench_hook, model_precision, proc_id, mem_consumption +): + set_seed(args['seed']) + if args['batch_size'] != 1: + log.warning("Only batch size 1 available for benchmarking") 
+ args["batch_size"] = 1 + images = [] + prompts = [] + for input_data in inputs: + if "media" in input_data: + images.append(load_image(input_data["media"])) + prompts.append(input_data["prompt"]) + + if args["output_dir"] is not None and num == 0: + for bs_index, in_text in enumerate(prompts): + llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) + tok_encode_start = time.perf_counter() + input_data = model.preprocess_inputs(text=prompts[0], image=images[0], **processor) + tok_encode_end = time.perf_counter() + tok_encode_time = (tok_encode_end - tok_encode_start) * 1000 + # Remove `token_type_ids` from inputs + input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data + input_token_size = input_tokens[0].numel() + if args['batch_size'] > 1: + out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) + out_str += " Batch_size={}, ".format(args['batch_size']) + out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) + if args['infer_count'] is not None: + out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) + log.info(out_str) + + max_rss_mem_consumption = '' + max_uss_mem_consumption = '' + max_shared_mem_consumption = '' + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + mem_consumption.start_collect_memory_consumption() + max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] + start = time.perf_counter() + if args['infer_count'] is not None and args['end_token_stopping'] is False: + model.generation_config.eos_token_id = None + model.config.eos_token_id = None + result = model.generate( + **input_data, + max_new_tokens=int(max_gen_tokens), + num_beams=args['num_beams'], + use_cache=True, + eos_token_id=None, + do_sample=False + ) + else: + result = model.generate( + **input_data, + max_new_tokens=int(max_gen_tokens), + num_beams=args['num_beams'], + use_cache=True, + do_sample=False + ) + end = time.perf_counter() + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + mem_consumption.end_collect_momory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() + mem_consumption.clear_max_memory_consumption() + + generation_time = end - start + tok_decode_start = time.perf_counter() + generated_text = processor["tokenizer"].batch_decode(result[:, input_data["input_ids"].shape[1]:], skip_special_tokens=True) + tok_decode_end = time.perf_counter() + tok_decode_time = (tok_decode_end - tok_decode_start) * 1000 + # Only text_gen need to minus length of input_data, because generated_text may include input_text + num_tokens = 0 + result_md5_list = [] + for bs_idx in range(args['batch_size']): + generated_token_size = len(result[bs_idx]) - input_data["input_ids"][bs_idx].numel() + num_tokens += generated_token_size + if generated_token_size > max_gen_tokens: + log.error('Output token size is over max output token size!') + result_text = generated_text[bs_idx] + if args["output_dir"] is not None: + llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) + result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) + if len(md5_list[num]) == 0: + md5_list[num] = {prompt_index : result_md5_list} + else: + md5_list[num][prompt_index] = 
result_md5_list + per_token_time = "" + if num_tokens > 0: + per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) + else: + log.warning("No generated tokens") + tm_list = [] + tm_infer_list = [] + tm_mm_embeddings = "" + if bench_hook is not None: + tm_list = bench_hook.get_time_list() + tm_mm_embeddings = np.mean(bench_hook.get_mm_embeddings_time_list()) * 1000 * 1000 + log.debug('latency of all tokens:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + tm_infer_list = bench_hook.get_time_infer_list() + log.debug('latency of all infers:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_infer_list)] + if args['num_beams'] == 1 and generated_token_size != len(tm_infer_list): + log.warning(f'Output token size({generated_token_size}) is not equal to infer count({len(tm_infer_list)})') + iter_data = gen_output_data.gen_iterate_data( + iter_idx=num, + in_size=input_token_size * args['batch_size'], + infer_count=len(tm_infer_list), + out_size=num_tokens, + gen_time=generation_time, + latency=per_token_time, + res_md5=result_md5_list, + max_rss_mem=max_rss_mem_consumption, + max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, + prompt_idx=prompt_index, + tokenization_time=(tok_encode_time, tok_decode_time), + mm_embeddings_preparation_time=tm_mm_embeddings + ) + iter_data_list.append(iter_data) + metrics_print.print_metrics( + num, + iter_data, + tm_list, + tm_infer_list, + warm_up=(num == 0), + max_rss_mem=max_rss_mem_consumption, + max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, + tokenization_time=(tok_encode_time, tok_decode_time), + batch_size=args['batch_size'], + prompt_idx=prompt_index + ) + if num > 0: + prev_md5 = md5_list[num - 1][prompt_index] + if result_md5_list != prev_md5: + log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " + f"is different from md5 of the {num - 1} iteration {prev_md5}") + metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index) + if not args.get("use_cb", False): + if num == 1: + # if the device is CPU, throw exception + if args['devices'].lower().startswith('cpu') is True: + assert (result_md5_list == prev_md5) + else: + # throw exception + assert (result_md5_list == prev_md5) + else: + metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index) + if bench_hook is not None: + bench_hook.clear_time_list() + bench_hook.clear_time_infer_list() + bench_hook.clear_mm_embeddins_time_list() + + +def load_image_genai(image_path): + pil_image = load_image(image_path) + image_data = np.array(pil_image.getdata()).reshape(1, pil_image.size[1], pil_image.size[0], 3).astype(np.uint8) + return ov.Tensor(image_data) + + +def run_visual_language_generation_genai( + inputs, num, model, processor, args, iter_data_list, md5_list, prompt_index, streamer, model_precision, proc_id, mem_consumption +): + if args['batch_size'] != 1: + log.warning("Only batch size 1 available for benchmarking") + args["batch_size"] = 1 + images = [] + prompts = [] + for input_data in inputs: + if "media" in input_data: + images.append(load_image_genai(input_data["media"])) + prompts.append(input_data["prompt"]) + if args["output_dir"] is not None and num == 0: + for bs_index, in_text in enumerate(prompts): + llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) + max_rss_mem_consumption = 
'' + max_uss_mem_consumption = '' + max_shared_mem_consumption = '' + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + mem_consumption.start_collect_memory_consumption() + max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] + gen_config = model.get_generation_config() + gen_config.max_new_tokens = max_gen_tokens + gen_config.num_beams = args["num_beams"] + gen_config.do_sample = False + start = time.perf_counter() + generation_result = model.generate(prompts[0], images=images[0], generation_config=gen_config) + end = time.perf_counter() + generated_text = generation_result.texts + perf_metrics = generation_result.perf_metrics + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + mem_consumption.end_collect_momory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() + mem_consumption.clear_max_memory_consumption() + + generation_time = end - start + result_md5_list = [] + generated_text_len = perf_metrics.get_num_generated_tokens() + if generated_text_len > max_gen_tokens: + log.error('Output token size is over max output token size!') + result_text = generated_text[0] + if args["output_dir"] is not None: + llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, 0, proc_id) + result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) + if len(md5_list[num]) == 0: + md5_list[num] = {prompt_index : result_md5_list} + else: + md5_list[num][prompt_index] = result_md5_list + per_token_time = "" + if generated_text_len > 0: + per_token_time = generation_time * 1000 / (generated_text_len / args['batch_size']) + else: + log.warning("No generated tokens") + first_token_time = (perf_metrics.get_ttft().mean - perf_metrics.raw_metrics.tokenization_durations[-1] / 1000) * args["batch_size"] + second_tokens_durations = ( + np.array(perf_metrics.raw_metrics.m_new_token_times[1:]) + - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) + ).tolist() + + tm_list = np.array([first_token_time] + second_tokens_durations) / 1000 + log.debug('latency of all tokens:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + tokenization_time = ( + np.mean(perf_metrics.raw_metrics.tokenization_durations) / 1000, + np.mean(perf_metrics.raw_metrics.detokenization_durations) / 1000 + ) + iter_data = gen_output_data.gen_iterate_data( + iter_idx=num, + in_size=args['batch_size'] * perf_metrics.get_num_input_tokens(), + infer_count=len(tm_list), + out_size=generated_text_len, + gen_time=generation_time, + latency=per_token_time, + res_md5=result_md5_list, + max_rss_mem=max_rss_mem_consumption, + max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, + prompt_idx=prompt_index, + tokenization_time=tokenization_time, + mm_embeddings_preparation_time=perf_metrics.get_prepare_embeddings_duration().mean + ) + iter_data_list.append(iter_data) + metrics_print.print_metrics( + num, + iter_data, + tm_list.tolist(), + None, + warm_up=(num == 0), + max_rss_mem=max_rss_mem_consumption, + max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, + tokenization_time=tokenization_time, + batch_size=args['batch_size'], + prompt_idx=prompt_index + ) + if num > 0: + prev_md5 = md5_list[num - 1][prompt_index] + if result_md5_list != prev_md5: + log.warning(f"[{num}] 
Prompt[{prompt_index}]'s md5 {result_md5_list} " + f"is different from md5 of the {num - 1} iteration {prev_md5}") + metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index) + else: + metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0], prompt_idx=prompt_index) + + +def run_visual_language_generation_benchmark(model_path, framework, device, args, num_iters, mem_consumption): + model, processor, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_image_text_gen_model(model_path, device, **args) + model_precision = model_utils.get_model_precision(model_path.parts) + iter_data_list = [] + md5_list = {num : {} for num in range(num_iters + 1)} + input_image_text_list = get_image_text_prompt(args) + if args['prompt_index'] is None: + prompt_idx_list = list(range(0, len(input_image_text_list))) + image_text_list = input_image_text_list + else: + prompt_idx_list = [] + image_text_list = [] + for i in args['prompt_index']: + if 0 <= i < len(input_image_text_list): + image_text_list.append(input_image_text_list[i]) + prompt_idx_list.append(i) + if len(input_image_text_list) == 0: + raise RuntimeError('==Failure prompts is empty ==') + log.info(f"Numbeams: {args['num_beams']}, benchmarking iter nums(exclude warm-up): {num_iters}, " + f'prompt nums: {len(image_text_list)}, prompt idx: {prompt_idx_list}') + + if not use_genai: + gen_fn = run_visual_language_generation_optimum + else: + gen_fn = run_visual_language_generation_genai + + proc_id = os.getpid() + iter_timestamp = model_utils.init_timestamp(num_iters, image_text_list, prompt_idx_list) + if args['subsequent'] is False: + for num in range(num_iters + 1): + for idx, input_text in enumerate(image_text_list): + p_idx = prompt_idx_list[idx] + if num == 0: + log.info(f'[warm-up][P{p_idx}] Input text: {input_text}') + iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat() + gen_fn( + input_text, num, model, processor, args, iter_data_list, md5_list, + p_idx, bench_hook, model_precision, proc_id, mem_consumption) + iter_timestamp[num][p_idx]['end'] = datetime.datetime.now().isoformat() + prefix = '[warm-up]' if num == 0 else '[{}]'.format(num) + log.info(f"{prefix}[P{p_idx}] start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}") + else: + for idx, input_text in enumerate(image_text_list): + p_idx = prompt_idx_list[idx] + for num in range(num_iters + 1): + if num == 0: + log.info(f'[warm-up][P{p_idx}] Input text: {input_text}') + iter_timestamp[num][p_idx]['start'] = datetime.datetime.now().isoformat() + gen_fn( + input_text, num, model, processor, args, iter_data_list, md5_list, + prompt_idx_list[idx], bench_hook, model_precision, proc_id, mem_consumption) + iter_timestamp[num][p_idx]['end'] = datetime.datetime.now().isoformat() + prefix = '[warm-up]' if num == 0 else '[{}]'.format(num) + log.info(f"{prefix}[P{p_idx}] start: {iter_timestamp[num][p_idx]['start']}, end: {iter_timestamp[num][p_idx]['end']}") + + metrics_print.print_average(iter_data_list, prompt_idx_list, args['batch_size'], True) + return iter_data_list, pretrain_time, iter_timestamp + + +def get_image_text_prompt(args): + vlm_file_list = [] + output_data_list, is_json_data = model_utils.get_param_from_file(args, ['media', "prompt"]) + if is_json_data: + vlm_param_list = parse_json_data.parse_vlm_json_data(output_data_list) + if len(vlm_param_list) > 0: + for vlm_file in vlm_param_list: + if args['prompt_file'] is not None and 
len(args['prompt_file']) > 0: + vlm_file['media'] = os.path.join(os.path.dirname(args['prompt_file'][0]), vlm_file['media'].replace('./', '')) + vlm_file['media'] = Path(vlm_file['media']) + vlm_file_list.append(vlm_file) + else: + vlm_file_list.append(output_data_list) + return vlm_file_list
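
For reference, the benchmarking path added by this last patch reduces to a handful of OpenVINO GenAI calls. The snippet below is a minimal sketch, not part of the patch itself: it mirrors `load_image_genai()` and `run_visual_language_generation_genai()` from `tools/llm_bench/task/visual_language_generation.py`, while the model directory and image path are hypothetical placeholders (the InternVL2-1B export matches the workflow step above).

```python
# Illustrative sketch only (not part of the patch): the GenAI calls that
# run_visual_language_generation_genai() wraps for benchmarking.
# "./ov_models/internvl2-1B" and "./images/example.png" are placeholder paths.
import numpy as np
import openvino as ov
import openvino_genai
from transformers.image_utils import load_image


def image_to_ov_tensor(image_path_or_url: str) -> ov.Tensor:
    # Same conversion as load_image_genai() above: 1 x H x W x 3, uint8.
    pil_image = load_image(image_path_or_url)
    image_data = np.array(pil_image.getdata()).reshape(
        1, pil_image.size[1], pil_image.size[0], 3).astype(np.uint8)
    return ov.Tensor(image_data)


pipe = openvino_genai.VLMPipeline("./ov_models/internvl2-1B", "CPU")

gen_config = pipe.get_generation_config()
gen_config.max_new_tokens = 128   # benchmark.py derives this from -ic/--infer_count or DEFAULT_OUTPUT_TOKEN_SIZE
gen_config.num_beams = 1
gen_config.do_sample = False

result = pipe.generate(
    "Describe image",
    images=image_to_ov_tensor("./images/example.png"),
    generation_config=gen_config,
)
print(result.texts[0])
print("generated tokens:", result.perf_metrics.get_num_generated_tokens())
```

When driving the same flow through `benchmark.py`, the inputs come either from `--media`/`--prompt` (as in the workflow step `python ./tools/llm_bench/benchmark.py -m ./ov_models/internvl2-1B --media <image> --prompt "What is unusual on this image?" -ic 20`) or from a `--prompt_file` whose `.jsonl` lines follow the new PROMPT.md section 5 keys, e.g. `{"media": "./images/example.png", "prompt": "Describe image"}` (the image path here is again only an example).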