diff --git a/src/cpp/src/image_generation/diffusion_pipeline.hpp b/src/cpp/src/image_generation/diffusion_pipeline.hpp index 786115e2f1..42e35101cc 100644 --- a/src/cpp/src/image_generation/diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/diffusion_pipeline.hpp @@ -80,6 +80,8 @@ class DiffusionPipeline { virtual ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const = 0; + virtual void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) = 0; + virtual ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, const ov::AnyMap& properties) = 0; virtual ov::Tensor decode(const ov::Tensor latent) = 0; diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index 20a7afd432..101401d434 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -237,6 +237,43 @@ class FluxPipeline : public DiffusionPipeline { m_vae->compile(device, properties); m_transformer->compile(device, properties); } + + void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override { + // encode_prompt + std::string prompt_2_str = + generation_config.prompt_2 != std::nullopt ? 
*generation_config.prompt_2 : positive_prompt; + + m_clip_text_encoder->infer(positive_prompt, "", false); + ov::Tensor pooled_prompt_embeds_out = m_clip_text_encoder->get_output_tensor(1); + + ov::Tensor prompt_embeds_out = m_t5_text_encoder->infer(prompt_2_str, generation_config.max_sequence_length); + + ov::Tensor pooled_prompt_embeds, prompt_embeds; + if (generation_config.num_images_per_prompt == 1) { + pooled_prompt_embeds = pooled_prompt_embeds_out; + prompt_embeds = prompt_embeds_out; + } else { + pooled_prompt_embeds = numpy_utils::repeat(pooled_prompt_embeds_out, generation_config.num_images_per_prompt); + prompt_embeds = numpy_utils::repeat(prompt_embeds_out, generation_config.num_images_per_prompt); + } + + // text_ids = torch.zeros(prompt_embeds.shape[1], 3) + ov::Shape text_ids_shape = {prompt_embeds.get_shape()[1], 3}; + ov::Tensor text_ids(ov::element::f32, text_ids_shape); + std::fill_n(text_ids.data(), text_ids_shape[0] * text_ids_shape[1], 0.0f); + + const size_t num_channels_latents = m_transformer->get_config().in_channels / 4; + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + size_t height = generation_config.height / vae_scale_factor; + size_t width = generation_config.width / vae_scale_factor; + + ov::Tensor latent_image_ids = prepare_latent_image_ids(generation_config.num_images_per_prompt, height / 2, width / 2); + + m_transformer->set_hidden_states("pooled_projections", pooled_prompt_embeds); + m_transformer->set_hidden_states("encoder_hidden_states", prompt_embeds); + m_transformer->set_hidden_states("txt_ids", text_ids); + m_transformer->set_hidden_states("img_ids", latent_image_ids); + } ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { @@ -260,10 +297,14 @@ class FluxPipeline : public DiffusionPipeline { ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, const ov::AnyMap& properties) override { - using namespace 
numpy_utils; ImageGenerationConfig generation_config = m_generation_config; generation_config.update_generation_config(properties); + if (!initial_image) { + // in case of typical text to image generation, we need to ignore 'strength' + generation_config.strength = 1.0f; + } + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); const auto& transformer_config = m_transformer->get_config(); @@ -274,50 +315,18 @@ class FluxPipeline : public DiffusionPipeline { check_inputs(generation_config, initial_image); - // encode_prompt - std::string prompt_2_str = - generation_config.prompt_2 != std::nullopt ? *generation_config.prompt_2 : positive_prompt; - - m_clip_text_encoder->infer(positive_prompt, "", false); - ov::Tensor pooled_prompt_embeds_out = m_clip_text_encoder->get_output_tensor(1); - - ov::Tensor prompt_embeds_out = m_t5_text_encoder->infer(positive_prompt, generation_config.max_sequence_length); - - ov::Tensor pooled_prompt_embeds, prompt_embeds; - if (generation_config.num_images_per_prompt == 1) { - pooled_prompt_embeds = pooled_prompt_embeds_out; - prompt_embeds = prompt_embeds_out; - } else { - pooled_prompt_embeds = repeat(pooled_prompt_embeds_out, generation_config.num_images_per_prompt); - prompt_embeds = repeat(prompt_embeds_out, generation_config.num_images_per_prompt); - } - - // text_ids = torch.zeros(prompt_embeds.shape[1], 3) - ov::Shape text_ids_shape = {prompt_embeds.get_shape()[1], 3}; - ov::Tensor text_ids(ov::element::f32, text_ids_shape); - std::fill_n(text_ids.data(), text_ids_shape[0] * text_ids_shape[1], 0.0f); - - size_t num_channels_latents = m_transformer->get_config().in_channels / 4; - size_t height = generation_config.height / vae_scale_factor; - size_t width = generation_config.width / vae_scale_factor; + compute_hidden_states(positive_prompt, generation_config); ov::Tensor latents = prepare_latents(initial_image, generation_config); - ov::Tensor latent_image_ids = 
prepare_latent_image_ids(generation_config.num_images_per_prompt, height / 2, width / 2); - - m_transformer->set_hidden_states("pooled_projections", pooled_prompt_embeds); - m_transformer->set_hidden_states("encoder_hidden_states", prompt_embeds); - m_transformer->set_hidden_states("txt_ids", text_ids); - m_transformer->set_hidden_states("img_ids", latent_image_ids); size_t image_seq_len = latents.get_shape()[1]; float mu = m_scheduler->calculate_shift(image_seq_len); float linspace_end = 1.0f / generation_config.num_inference_steps; - std::vector sigmas = linspace(1.0f, linspace_end, generation_config.num_inference_steps, true); + std::vector sigmas = numpy_utils::linspace(1.0f, linspace_end, generation_config.num_inference_steps, true); m_scheduler->set_timesteps_with_sigma(sigmas, mu); std::vector timesteps = m_scheduler->get_float_timesteps(); - size_t num_inference_steps = timesteps.size(); // Use callback if defined std::function callback; @@ -331,7 +340,7 @@ class FluxPipeline : public DiffusionPipeline { ov::Tensor timestep(ov::element::f32, {1}); float* timestep_data = timestep.data(); - for (size_t inference_step = 0; inference_step < num_inference_steps; ++inference_step) { + for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) { timestep_data[0] = timesteps[inference_step] / 1000; ov::Tensor noise_pred_tensor = m_transformer->infer(latents, timestep); diff --git a/src/cpp/src/image_generation/generation_config.cpp b/src/cpp/src/image_generation/generation_config.cpp index 89ff21ffe2..938034f628 100644 --- a/src/cpp/src/image_generation/generation_config.cpp +++ b/src/cpp/src/image_generation/generation_config.cpp @@ -1,8 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "image_generation/stable_diffusion_pipeline.hpp" -#include "image_generation/stable_diffusion_xl_pipeline.hpp" +#include "openvino/genai/image_generation/generation_config.hpp" #include #include diff 
--git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp index 71d9fdd6ff..30c8831980 100644 --- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp @@ -219,59 +219,9 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { m_vae->compile(device, properties); } - ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { - const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - ov::Shape latent_shape{generation_config.num_images_per_prompt, - m_transformer->get_config().in_channels, - generation_config.height / vae_scale_factor, - generation_config.width / vae_scale_factor}; - - ov::Tensor latent(ov::element::f32, {}); - - if (initial_image) { - OPENVINO_THROW("StableDiffusion3 image to image is not implemented"); - } else { - latent = generation_config.generator->randn_tensor(latent_shape); - - // latents are multiplied by 'init_noise_sigma' - float * latent_data = latent.data(); - for (size_t i = 0; i < latent.get_size(); ++i) - latent_data[i] *= m_scheduler->get_init_noise_sigma(); - } - - return latent; - } - - ov::Tensor generate(const std::string& positive_prompt, - ov::Tensor initial_image, - const ov::AnyMap& properties) override { - using namespace numpy_utils; - ImageGenerationConfig generation_config = m_generation_config; - generation_config.update_generation_config(properties); - - if (!initial_image) { - // in case of typical text to image generation, we need to ignore 'strength' - generation_config.strength = 1.0f; - } - + void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override { const auto& transformer_config = m_transformer->get_config(); - const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) - ? 
2 - : 1; // Transformer accepts 2x batch in case of CFG - - const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - - if (generation_config.height < 0) - generation_config.height = transformer_config.sample_size * vae_scale_factor; - if (generation_config.width < 0) - generation_config.width = transformer_config.sample_size * vae_scale_factor; - - check_inputs(generation_config, initial_image); - - if (generation_config.generator == nullptr) { - uint32_t seed = time(NULL); - generation_config.generator = std::make_shared(seed); - } + const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Transformer accepts 2x batch in case of CFG // Input tensors for transformer model ov::Tensor prompt_embeds_inp, pooled_prompt_embeds_inp; @@ -334,10 +284,10 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { pooled_prompt_2_embed = pooled_prompt_2_embed_out; prompt_2_embed = prompt_2_embed_out; } else { - pooled_prompt_embed = repeat(pooled_prompt_embed_out, generation_config.num_images_per_prompt); - prompt_embed = repeat(prompt_embed_out, generation_config.num_images_per_prompt); - pooled_prompt_2_embed = repeat(pooled_prompt_2_embed_out, generation_config.num_images_per_prompt); - prompt_2_embed = repeat(prompt_2_embed_out, generation_config.num_images_per_prompt); + pooled_prompt_embed = numpy_utils::repeat(pooled_prompt_embed_out, generation_config.num_images_per_prompt); + prompt_embed = numpy_utils::repeat(prompt_embed_out, generation_config.num_images_per_prompt); + pooled_prompt_2_embed = numpy_utils::repeat(pooled_prompt_2_embed_out, generation_config.num_images_per_prompt); + prompt_2_embed = numpy_utils::repeat(prompt_2_embed_out, generation_config.num_images_per_prompt); } // concatenate hidden_states from two encoders @@ -351,7 +301,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { const float* pr_emb_2_data = prompt_2_embed.data(); float* clip_prompt_embeds_data = 
clip_prompt_embeds.data(); - concat_3d_by_rows(pr_emb_1_data, pr_emb_2_data, clip_prompt_embeds_data, pr_emb_shape, pr_emb_2_shape); + numpy_utils::concat_3d_by_rows(pr_emb_1_data, pr_emb_2_data, clip_prompt_embeds_data, pr_emb_shape, pr_emb_2_shape); // TODO: text_encoder_3 ov::Shape t5_prompt_embed_shape = {generation_config.num_images_per_prompt, @@ -374,7 +324,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { pad_embeds_shape[2]}; ov::Tensor prompt_embeds(ov::element::f32, prompt_embeds_shape); float* prompt_embeds_data = prompt_embeds.data(); - concat_3d_by_cols(pad_embeds.data(), + numpy_utils::concat_3d_by_cols(pad_embeds.data(), t5_prompt_embed.data(), prompt_embeds_data, pad_embeds_shape, @@ -391,11 +341,11 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { ov::Tensor pooled_prompt_embeds(ov::element::f32, pooled_prompt_embeds_shape); float* pooled_prompt_embeds_data = pooled_prompt_embeds.data(); - concat_2d_by_rows(pooled_prompt_embed_data, - pooled_prompt_2_embed_data, - pooled_prompt_embeds_data, - p_pr_emb_shape, - p_pr_emb_2_shape); + numpy_utils::concat_2d_by_rows(pooled_prompt_embed_data, + pooled_prompt_2_embed_data, + pooled_prompt_embeds_data, + p_pr_emb_shape, + p_pr_emb_2_shape); // From steps above we'll use prompt_embeds and pooled_prompt_embeds tensors if (do_classifier_free_guidance(generation_config.guidance_scale)) { @@ -414,10 +364,10 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { negative_pooled_prompt_2_embed = negative_pooled_prompt_2_embed_out; negative_prompt_2_embed = negative_prompt_2_embed_out; } else { - negative_pooled_prompt_embed = repeat(negative_pooled_prompt_embed_out, generation_config.num_images_per_prompt); - negative_prompt_embed = repeat(negative_prompt_embed_out, generation_config.num_images_per_prompt); - negative_pooled_prompt_2_embed = repeat(negative_pooled_prompt_2_embed_out, generation_config.num_images_per_prompt); - negative_prompt_2_embed = 
repeat(negative_prompt_2_embed_out, generation_config.num_images_per_prompt); + negative_pooled_prompt_embed = numpy_utils::repeat(negative_pooled_prompt_embed_out, generation_config.num_images_per_prompt); + negative_prompt_embed = numpy_utils::repeat(negative_prompt_embed_out, generation_config.num_images_per_prompt); + negative_pooled_prompt_2_embed = numpy_utils::repeat(negative_pooled_prompt_2_embed_out, generation_config.num_images_per_prompt); + negative_prompt_2_embed = numpy_utils::repeat(negative_prompt_2_embed_out, generation_config.num_images_per_prompt); } // concatenate hidden_states from two encoders @@ -433,11 +383,11 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { const float* neg_pr_emb_2_data = negative_prompt_2_embed.data(); float* neg_clip_prompt_embeds_data = neg_clip_prompt_embeds.data(); - concat_3d_by_rows(neg_pr_emb_1_data, - neg_pr_emb_2_data, - neg_clip_prompt_embeds_data, - n_pr_emb_1_shape, - n_pr_emb_2_shape); + numpy_utils::concat_3d_by_rows(neg_pr_emb_1_data, + neg_pr_emb_2_data, + neg_clip_prompt_embeds_data, + n_pr_emb_1_shape, + n_pr_emb_2_shape); std::vector t5_neg_prompt_embed( t5_prompt_embed_shape[0] * t5_prompt_embed_shape[1] * t5_prompt_embed_shape[2], @@ -464,11 +414,11 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { ov::Tensor neg_prompt_embeds(ov::element::f32, neg_prompt_embeds_shape); float* neg_prompt_embeds_data = neg_prompt_embeds.data(); - concat_3d_by_cols(neg_pad_embeds.data(), - t5_neg_prompt_embed.data(), - neg_prompt_embeds_data, - neg_pad_embeds_shape, - t5_prompt_embed_shape); + numpy_utils::concat_3d_by_cols(neg_pad_embeds.data(), + t5_neg_prompt_embed.data(), + neg_prompt_embeds_data, + neg_pad_embeds_shape, + t5_prompt_embed_shape); // neg_pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], // dim=-1) @@ -483,11 +433,11 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { ov::Tensor 
neg_pooled_prompt_embeds(ov::element::f32, neg_pooled_prompt_embeds_shape); float* neg_pooled_prompt_embeds_data = neg_pooled_prompt_embeds.data(); - concat_2d_by_rows(neg_pooled_pr_emb_data, - neg_pooled_pr_2_emb_data, - neg_pooled_prompt_embeds_data, - neg_pooled_pr_emb_shape, - neg_pooled_pr_2_emb_shape); + numpy_utils::concat_2d_by_rows(neg_pooled_pr_emb_data, + neg_pooled_pr_2_emb_data, + neg_pooled_prompt_embeds_data, + neg_pooled_pr_emb_shape, + neg_pooled_pr_2_emb_shape); // From steps above we'll use neg_prompt_embeds and neg_pooled_prompt_embeds tensors // Fill in transformer inputs: concat positive and negative prompt_embeds @@ -496,11 +446,11 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { prompt_embeds_shape[2]}; prompt_embeds_inp = ov::Tensor(ov::element::f32, prompt_embeds_inp_shape); float* prompt_embeds_inp_data = prompt_embeds_inp.data(); - concat_3d_by_channels(neg_prompt_embeds_data, - prompt_embeds_data, - prompt_embeds_inp_data, - neg_prompt_embeds_shape, - prompt_embeds_shape); + numpy_utils::concat_3d_by_channels(neg_prompt_embeds_data, + prompt_embeds_data, + prompt_embeds_inp_data, + neg_prompt_embeds_shape, + prompt_embeds_shape); ov::Shape pooled_prompt_embeds_inp_shape = { neg_pooled_prompt_embeds_shape[0] + pooled_prompt_embeds_shape[0], @@ -508,24 +458,80 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { pooled_prompt_embeds_inp = ov::Tensor(ov::element::f32, pooled_prompt_embeds_inp_shape); float* pooled_prompt_embeds_input_data = pooled_prompt_embeds_inp.data(); - concat_2d_by_channels(neg_pooled_prompt_embeds_data, - pooled_prompt_embeds_data, - pooled_prompt_embeds_input_data, - neg_pooled_prompt_embeds_shape, - pooled_prompt_embeds_shape); + numpy_utils::concat_2d_by_channels(neg_pooled_prompt_embeds_data, + pooled_prompt_embeds_data, + pooled_prompt_embeds_input_data, + neg_pooled_prompt_embeds_shape, + pooled_prompt_embeds_shape); } else { // Fill in transformer inputs prompt_embeds_inp = 
prompt_embeds; pooled_prompt_embeds_inp = pooled_prompt_embeds; } + // 4. Set model inputs + m_transformer->set_hidden_states("encoder_hidden_states", prompt_embeds_inp); + m_transformer->set_hidden_states("pooled_projections", pooled_prompt_embeds_inp); + } + + ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + ov::Shape latent_shape{generation_config.num_images_per_prompt, + m_transformer->get_config().in_channels, + generation_config.height / vae_scale_factor, + generation_config.width / vae_scale_factor}; + + ov::Tensor latent(ov::element::f32, {}); + + if (initial_image) { + OPENVINO_THROW("StableDiffusion3 image to image is not implemented"); + } else { + latent = generation_config.generator->randn_tensor(latent_shape); + + // latents are multiplied by 'init_noise_sigma' + float * latent_data = latent.data(); + for (size_t i = 0; i < latent.get_size(); ++i) + latent_data[i] *= m_scheduler->get_init_noise_sigma(); + } + + return latent; + } + + ov::Tensor generate(const std::string& positive_prompt, + ov::Tensor initial_image, + const ov::AnyMap& properties) override { + ImageGenerationConfig generation_config = m_generation_config; + generation_config.update_generation_config(properties); + + if (!initial_image) { + // in case of typical text to image generation, we need to ignore 'strength' + generation_config.strength = 1.0f; + } + + const auto& transformer_config = m_transformer->get_config(); + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) + ? 
2 + : 1; // Transformer accepts 2x batch in case of CFG + + if (generation_config.height < 0) + generation_config.height = transformer_config.sample_size * vae_scale_factor; + if (generation_config.width < 0) + generation_config.width = transformer_config.sample_size * vae_scale_factor; + + check_inputs(generation_config, initial_image); + + if (generation_config.generator == nullptr) { + uint32_t seed = time(NULL); + generation_config.generator = std::make_shared(seed); + } + // 3. Prepare timesteps m_scheduler->set_timesteps(generation_config.num_inference_steps, generation_config.strength); std::vector timesteps = m_scheduler->get_float_timesteps(); - // 4. Set model inputs - m_transformer->set_hidden_states("encoder_hidden_states", prompt_embeds_inp); - m_transformer->set_hidden_states("pooled_projections", pooled_prompt_embeds_inp); + // 4 compute text encoders and set hidden states + compute_hidden_states(positive_prompt, generation_config); // 5. Prepare latent variables ov::Tensor latent = prepare_latents(initial_image, generation_config); @@ -548,8 +554,8 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) { // concat the same latent twice along a batch dimension in case of CFG if (batch_size_multiplier > 1) { - batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); - batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt); + numpy_utils::batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); + numpy_utils::batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt); } else { // just assign to save memory copy latent_cfg = latent; @@ -587,7 +593,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { } } - return m_vae->decode(latent); + return decode(latent); } ov::Tensor decode(const 
ov::Tensor latent) override { diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp index ecf810827c..629c99270c 100644 --- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp @@ -138,8 +138,41 @@ class StableDiffusionPipeline : public DiffusionPipeline { m_vae->compile(device, properties); } + void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override { + const auto& unet_config = m_unet->get_config(); + const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Unet accepts 2x batch in case of CFG + + std::string negative_prompt = generation_config.negative_prompt != std::nullopt ? *generation_config.negative_prompt : std::string{}; + ov::Tensor encoder_hidden_states = m_clip_text_encoder->infer(positive_prompt, negative_prompt, + batch_size_multiplier > 1); + + // replicate encoder hidden state to UNet model + if (generation_config.num_images_per_prompt == 1) { + // reuse output of text encoder directly w/o extra memory copy + m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states); + } else { + ov::Shape enc_shape = encoder_hidden_states.get_shape(); + enc_shape[0] *= generation_config.num_images_per_prompt; + + ov::Tensor encoder_hidden_states_repeated(encoder_hidden_states.get_element_type(), enc_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + numpy_utils::batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, 0, n); + if (batch_size_multiplier > 1) { + numpy_utils::batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states_repeated); + } + + if (unet_config.time_cond_proj_dim >= 0) { // 
LCM + ov::Tensor timestep_cond = get_guidance_scale_embedding(generation_config.guidance_scale - 1.0f, unet_config.time_cond_proj_dim); + m_unet->set_hidden_states("timestep_cond", timestep_cond); + } + } + ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { - using namespace numpy_utils; const auto& unet_config = m_unet->get_config(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); @@ -152,7 +185,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { if (generation_config.num_images_per_prompt > 1) { ov::Tensor batched_latent(ov::element::f32, latent_shape); for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { - batch_copy(latent, batched_latent, 0, n); + numpy_utils::batch_copy(latent, batched_latent, 0, n); } latent = batched_latent; } @@ -202,38 +235,12 @@ class StableDiffusionPipeline : public DiffusionPipeline { generation_config.generator = std::make_shared(seed); } - std::string negative_prompt = generation_config.negative_prompt != std::nullopt ? 
*generation_config.negative_prompt : std::string{}; - ov::Tensor encoder_hidden_states = m_clip_text_encoder->infer(positive_prompt, negative_prompt, - batch_size_multiplier > 1); - - // replicate encoder hidden state to UNet model - if (generation_config.num_images_per_prompt == 1) { - // reuse output of text encoder directly w/o extra memory copy - m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states); - } else { - ov::Shape enc_shape = encoder_hidden_states.get_shape(); - enc_shape[0] *= generation_config.num_images_per_prompt; - - ov::Tensor encoder_hidden_states_repeated(encoder_hidden_states.get_element_type(), enc_shape); - for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { - batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, 0, n); - if (batch_size_multiplier > 1) { - batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, - 1, generation_config.num_images_per_prompt + n); - } - } - - m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states_repeated); - } - - if (unet_config.time_cond_proj_dim >= 0) { // LCM - ov::Tensor timestep_cond = get_guidance_scale_embedding(generation_config.guidance_scale - 1.0f, unet_config.time_cond_proj_dim); - m_unet->set_hidden_states("timestep_cond", timestep_cond); - } - m_scheduler->set_timesteps(generation_config.num_inference_steps, generation_config.strength); std::vector timesteps = m_scheduler->get_timesteps(); + // compute text encoders and set hidden states + compute_hidden_states(positive_prompt, generation_config); + // preparate initial latents ov::Tensor latent = prepare_latents(initial_image, generation_config); @@ -252,10 +259,10 @@ class StableDiffusionPipeline : public DiffusionPipeline { ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {}); for (size_t inference_step = 0; inference_step < timesteps.size(); inference_step++) { - batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); + 
numpy_utils::batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); // concat the same latent twice along a batch dimension in case of CFG if (batch_size_multiplier > 1) { - batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt); + numpy_utils::batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt); } m_scheduler->scale_model_input(latent_cfg, inference_step); @@ -296,7 +303,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { } } - return m_vae->decode(denoised); + return decode(denoised); } ov::Tensor decode(const ov::Tensor latent) override { diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp index 70f69d37fa..e7c8c35ce3 100644 --- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp @@ -168,70 +168,9 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { m_vae->compile(device, properties); } - ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { - using namespace numpy_utils; - const auto& unet_config = m_unet->get_config(); - const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - - ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels, - generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; - ov::Tensor latent; - - if (initial_image) { - latent = m_vae->encode(initial_image, generation_config.generator); - if (generation_config.num_images_per_prompt > 1) { - ov::Tensor batched_latent(ov::element::f32, latent_shape); - for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { - batch_copy(latent, batched_latent, 0, n); - } - latent = batched_latent; - } - 
m_scheduler->add_noise(latent, generation_config.generator); - } else { - latent = generation_config.generator->randn_tensor(latent_shape); - - // latents are multiplied by 'init_noise_sigma' - float * latent_data = latent.data(); - for (size_t i = 0; i < latent.get_size(); ++i) - latent_data[i] *= m_scheduler->get_init_noise_sigma(); - } - - return latent; - } - - ov::Tensor generate(const std::string& positive_prompt, - ov::Tensor initial_image, - const ov::AnyMap& properties) override { - using namespace numpy_utils; - ImageGenerationConfig generation_config = m_generation_config; - generation_config.update_generation_config(properties); - - if (!initial_image) { - // in case of typical text to image generation, we need to ignore 'strength' - generation_config.strength = 1.0f; - } - - // Stable Diffusion pipeline - // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline - + void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override { const auto& unet_config = m_unet->get_config(); const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 
2 : 1; // Unet accepts 2x batch in case of CFG - const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - - if (generation_config.height < 0) - generation_config.height = unet_config.sample_size * vae_scale_factor; - if (generation_config.width < 0) - generation_config.width = unet_config.sample_size * vae_scale_factor; - check_inputs(generation_config, initial_image); - - m_clip_text_encoder->set_adapters(generation_config.adapters); - m_clip_text_encoder_with_projection->set_adapters(generation_config.adapters); - m_unet->set_adapters(generation_config.adapters); - - if (generation_config.generator == nullptr) { - uint32_t seed = time(NULL); - generation_config.generator = std::make_shared(seed); - } std::vector time_ids = {static_cast(generation_config.width), static_cast(generation_config.height), @@ -353,9 +292,9 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { ov::Tensor encoder_hidden_states_repeated(encoder_hidden_states.get_element_type(), enc_shape); for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { - batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, 0, n); + numpy_utils::batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, 0, n); if (batch_size_multiplier > 1) { - batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, + numpy_utils::batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, 1, generation_config.num_images_per_prompt + n); } } @@ -367,9 +306,9 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { ov::Tensor add_text_embeds_repeated(add_text_embeds.get_element_type(), t_emb_shape); for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { - batch_copy(add_text_embeds, add_text_embeds_repeated, 0, n); + numpy_utils::batch_copy(add_text_embeds, add_text_embeds_repeated, 0, n); if (batch_size_multiplier > 1) { - batch_copy(add_text_embeds, add_text_embeds_repeated, + numpy_utils::batch_copy(add_text_embeds, 
add_text_embeds_repeated, 1, generation_config.num_images_per_prompt + n); } } @@ -380,9 +319,9 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { t_ids_shape[0] *= generation_config.num_images_per_prompt; ov::Tensor add_time_ids_repeated(add_time_ids.get_element_type(), t_ids_shape); for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { - batch_copy(add_time_ids, add_time_ids_repeated, 0, n); + numpy_utils::batch_copy(add_time_ids, add_time_ids_repeated, 0, n); if (batch_size_multiplier > 1) { - batch_copy(add_time_ids, add_time_ids_repeated, + numpy_utils::batch_copy(add_time_ids, add_time_ids_repeated, 1, generation_config.num_images_per_prompt + n); } } @@ -394,13 +333,81 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { ov::Tensor timestep_cond = get_guidance_scale_embedding(generation_config.guidance_scale - 1.0f, unet_config.time_cond_proj_dim); m_unet->set_hidden_states("timestep_cond", timestep_cond); } + } + + ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { + const auto& unet_config = m_unet->get_config(); + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + + ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels, + generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; + ov::Tensor latent; + + if (initial_image) { + latent = m_vae->encode(initial_image, generation_config.generator); + if (generation_config.num_images_per_prompt > 1) { + ov::Tensor batched_latent(ov::element::f32, latent_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + numpy_utils::batch_copy(latent, batched_latent, 0, n); + } + latent = batched_latent; + } + m_scheduler->add_noise(latent, generation_config.generator); + } else { + latent = generation_config.generator->randn_tensor(latent_shape); + + // latents are multiplied by 'init_noise_sigma' + float * 
latent_data = latent.data(); + for (size_t i = 0; i < latent.get_size(); ++i) + latent_data[i] *= m_scheduler->get_init_noise_sigma(); + } + + return latent; + } + + ov::Tensor generate(const std::string& positive_prompt, + ov::Tensor initial_image, + const ov::AnyMap& properties) override { + ImageGenerationConfig generation_config = m_generation_config; + generation_config.update_generation_config(properties); + + if (!initial_image) { + // in case of typical text to image generation, we need to ignore 'strength' + generation_config.strength = 1.0f; + } + + // Stable Diffusion pipeline + // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline + + const auto& unet_config = m_unet->get_config(); + const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Unet accepts 2x batch in case of CFG + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + + if (generation_config.height < 0) + generation_config.height = unet_config.sample_size * vae_scale_factor; + if (generation_config.width < 0) + generation_config.width = unet_config.sample_size * vae_scale_factor; + check_inputs(generation_config, initial_image); + + m_clip_text_encoder->set_adapters(generation_config.adapters); + m_clip_text_encoder_with_projection->set_adapters(generation_config.adapters); + m_unet->set_adapters(generation_config.adapters); + + if (generation_config.generator == nullptr) { + uint32_t seed = time(NULL); + generation_config.generator = std::make_shared(seed); + } m_scheduler->set_timesteps(generation_config.num_inference_steps, generation_config.strength); std::vector timesteps = m_scheduler->get_timesteps(); + // compute text encoders and set hidden states + compute_hidden_states(positive_prompt, generation_config); + // preparate initial latents ov::Tensor latent = prepare_latents(initial_image, generation_config); + // prepare latents passed to 
models taking into account guidance scale (batch size multiplier) ov::Shape latent_shape_cfg = latent.get_shape(); latent_shape_cfg[0] *= batch_size_multiplier; ov::Tensor latent_cfg(ov::element::f32, latent_shape_cfg); @@ -415,10 +422,10 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {}); for (size_t inference_step = 0; inference_step < timesteps.size(); inference_step++) { - batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); + numpy_utils::batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); // concat the same latent twice along a batch dimension in case of CFG if (batch_size_multiplier > 1) { - batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt); + numpy_utils::batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt); } m_scheduler->scale_model_input(latent_cfg, inference_step); @@ -459,7 +466,7 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { } } - return m_vae->decode(denoised); + return decode(denoised); } ov::Tensor decode(const ov::Tensor latent) override {