From d17f7168f278ef98acfdc7ba1ac93e4c759a6402 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 13 Dec 2024 08:03:11 +0400 Subject: [PATCH] [Image generation] Added num_steps to callback (#1372) With image to image and inpainting, a user-passed `num_inference_steps` is scaled based on the `strength` parameter. So, we need to report the actual number of steps within `callback` CC @RyanMetcalfeInt8 --- samples/cpp/image_generation/README.md | 6 ++--- samples/python/image_generation/README.md | 6 ++--- .../image_generation/generation_config.hpp | 4 ++-- .../src/image_generation/flux_pipeline.hpp | 22 ++++++------------- .../stable_diffusion_3_pipeline.hpp | 21 ++++++++---------- .../stable_diffusion_pipeline.hpp | 13 +++++------ src/python/py_utils.cpp | 2 +- tools/llm_bench/llm_bench_utils/ov_utils.py | 2 +- 8 files changed, 31 insertions(+), 45 deletions(-) diff --git a/samples/cpp/image_generation/README.md b/samples/cpp/image_generation/README.md index 795bea8999..8a5cc5aa19 100644 --- a/samples/cpp/image_generation/README.md +++ b/samples/cpp/image_generation/README.md @@ -52,9 +52,9 @@ Please find the template of the callback usage below. 
```cpp ov::genai::Text2ImagePipeline pipe(models_path, device); -auto callback = [&](size_t step, ov::Tensor& intermediate_res) -> bool { - std::cout << "Image generation step: " << step << std::endl; - ov::Tensor img = pipe.decode(intermediate_res); // get intermediate image tensor +auto callback = [&](size_t step, size_t num_steps, ov::Tensor& latent) -> bool { + std::cout << "Image generation step: " << step << " / " << num_steps << std::endl; + ov::Tensor img = pipe.decode(latent); // get intermediate image tensor if (your_condition) // return true if you want to interrupt image generation return true; return false; diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md index 4abe45b2b4..321f3f6d05 100644 --- a/samples/python/image_generation/README.md +++ b/samples/python/image_generation/README.md @@ -52,9 +52,9 @@ Please find the template of the callback usage below. ```python pipe = openvino_genai.Text2ImagePipeline(model_dir, device) -def callback(step, intermediate_res): - print("Image generation step: ", step) - image_tensor = pipe.decode(intermediate_res) # get intermediate image tensor +def callback(step, num_steps, latent): + print(f"Image generation step: {step} / {num_steps}") + image_tensor = pipe.decode(latent) # get intermediate image tensor if your_condition: # return True if you want to interrupt image generation return True return False diff --git a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp index 0b749ecd83..50e576466d 100644 --- a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp +++ b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp @@ -216,11 +216,11 @@ static constexpr ov::Property max_sequence_length{"max_sequence_length"}; /** * User callback for image generation pipelines, which is called within a pipeline with the following arguments: - * - Total number 
of inference steps. Note, that in case of 'strength' parameter, the number of inference steps is reduced linearly * - Current inference step + * - Total number of inference steps. Note, that in case of 'strength' parameter, the number of inference steps is reduced linearly * - Tensor representing current latent. Such latent can be converted to human-readable representation via image generation pipeline 'decode()' method */ -static constexpr ov::Property<std::function<bool(size_t, ov::Tensor&)>> callback{"callback"}; +static constexpr ov::Property<std::function<bool(size_t, size_t, ov::Tensor&)>> callback{"callback"}; /** * Function to pass 'ImageGenerationConfig' as property to 'generate()' call. diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index 4cdac5bb1a..ac82bd0cab 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -326,9 +326,11 @@ class FluxPipeline : public DiffusionPipeline { m_custom_generation_config.strength = 1.0f; } - if (!initial_image) { - // in case of typical text to image generation, we need to ignore 'strength' - m_custom_generation_config.strength = 1.0f; + // Use callback if defined + std::function<bool(size_t, size_t, ov::Tensor&)> callback = nullptr; + auto callback_iter = properties.find(ov::genai::callback.name()); + if (callback_iter != properties.end()) { + callback = callback_iter->second.as<std::function<bool(size_t, size_t, ov::Tensor&)>>(); } const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); @@ -355,14 +357,6 @@ class FluxPipeline : public DiffusionPipeline { m_scheduler->set_timesteps_with_sigma(sigmas, mu); std::vector<float> timesteps = m_scheduler->get_float_timesteps(); - // Use callback if defined - std::function<bool(size_t, ov::Tensor&)> callback; - auto callback_iter = properties.find(ov::genai::callback.name()); - bool do_callback = callback_iter != properties.end(); - if (do_callback) { - callback = callback_iter->second.as<std::function<bool(size_t, ov::Tensor&)>>(); - } - // 6.
Denoising loop ov::Tensor timestep(ov::element::f32, {1}); float* timestep_data = timestep.data<float>(); @@ -375,10 +369,8 @@ class FluxPipeline : public DiffusionPipeline { auto scheduler_step_result = m_scheduler->step(noise_pred_tensor, latents, inference_step, m_custom_generation_config.generator); latents = scheduler_step_result["latent"]; - if (do_callback) { - if (callback(inference_step, latents)) { - return ov::Tensor(ov::element::u8, {}); + if (callback && callback(inference_step, timesteps.size(), latents)) { + return ov::Tensor(ov::element::u8, {}); } } diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp index 4e9a70ec2d..3cdaa409d1 100644 --- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp @@ -431,6 +431,13 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { generation_config.strength = 1.0f; } + // Use callback if defined + std::function<bool(size_t, size_t, ov::Tensor&)> callback = nullptr; + auto callback_iter = properties.find(ov::genai::callback.name()); + if (callback_iter != properties.end()) { + callback = callback_iter->second.as<std::function<bool(size_t, size_t, ov::Tensor&)>>(); + } + const auto& transformer_config = m_transformer->get_config(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) @@ -467,14 +474,6 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { // 6.
Denoising loop ov::Tensor noisy_residual_tensor(ov::element::f32, {}); - // Use callback if defined - std::function<bool(size_t, ov::Tensor&)> callback; - auto callback_iter = properties.find(ov::genai::callback.name()); - bool do_callback = callback_iter != properties.end(); - if (do_callback) { - callback = callback_iter->second.as<std::function<bool(size_t, ov::Tensor&)>>(); - } - for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) { // concat the same latent twice along a batch dimension in case of CFG if (batch_size_multiplier > 1) { @@ -510,10 +509,8 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step, generation_config.generator); latent = scheduler_step_result["latent"]; - if (do_callback) { - if (callback(inference_step, latent)) { - return ov::Tensor(ov::element::u8, {}); + if (callback && callback(inference_step, timesteps.size(), latent)) { + return ov::Tensor(ov::element::u8, {}); } } diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp index 9dbdbac088..c53c9b7d25 100644 --- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp @@ -306,11 +306,10 @@ class StableDiffusionPipeline : public DiffusionPipeline { } // use callback if defined - std::function<bool(size_t, ov::Tensor&)> callback; + std::function<bool(size_t, size_t, ov::Tensor&)> callback = nullptr; auto callback_iter = properties.find(ov::genai::callback.name()); - bool do_callback = callback_iter != properties.end(); - if (do_callback) { - callback = callback_iter->second.as<std::function<bool(size_t, ov::Tensor&)>>(); + if (callback_iter != properties.end()) { + callback = callback_iter->second.as<std::function<bool(size_t, size_t, ov::Tensor&)>>(); } // Stable Diffusion pipeline @@ -400,10 +399,8 @@ class StableDiffusionPipeline : public DiffusionPipeline { const auto it = scheduler_step_result.find("denoised"); denoised = it != scheduler_step_result.end() ?
it->second : latent; - if (do_callback) { - if (callback(inference_step, denoised)) { - return ov::Tensor(ov::element::u8, {}); + if (callback && callback(inference_step, timesteps.size(), denoised)) { + return ov::Tensor(ov::element::u8, {}); } } diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index 9d33318f0a..45a0c46174 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -280,7 +280,7 @@ ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) { } else if (py::isinstance(py_obj)) { return py::cast>(py_obj); } else if (py::isinstance<py::function>(py_obj) && property_name == "callback") { - return py::cast<std::function<bool(size_t, ov::Tensor&)>>(py_obj); + return py::cast<std::function<bool(size_t, size_t, ov::Tensor&)>>(py_obj); } else if ((py::isinstance(py_obj) || py::isinstance(py_obj) || py::isinstance(py_obj)) && property_name == "streamer") { auto streamer = py::cast(py_obj); return ov::genai::streamer(pystreamer_to_streamer(streamer)).second; diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py index c5fa422824..8a28fbe355 100644 --- a/tools/llm_bench/llm_bench_utils/ov_utils.py +++ b/tools/llm_bench/llm_bench_utils/ov_utils.py @@ -366,7 +366,7 @@ def __init__(self) -> types.NoneType: self.start_time = time.perf_counter() self.duration = -1 - def __call__(self, step, latents): + def __call__(self, step, num_steps, latents): self.iteration_time.append(time.perf_counter() - self.start_time) self.start_time = time.perf_counter() return False