From aef15918afc952d77f393829561e76fd2d8b538d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 11 Dec 2024 12:36:20 +0400 Subject: [PATCH] [Image generation] Image2image and inpainting pipelines (#1334) - Added common infra for image to image and inpainting pipelines - Reused it in SD / LCM / SDXL pipelines - Generalized SD pipeline and inherit SDXL pipeline from it. SDXL overrides only SDXL specific blocks which work with second text encoder - Added Python API for Image2Image and Inpainting pipelines - Added Python and C++ samples --- .github/workflows/causal_lm_cpp.yml | 2 +- .github/workflows/lcm_dreamshaper_cpp.yml | 56 +++- .../workflows/stable_diffusion_1_5_cpp.yml | 36 +-- README.md | 139 +++++++-- samples/CMakeLists.txt | 6 +- .../512x512.bmp | 0 samples/cpp/image_generation/CMakeLists.txt | 99 +++++++ .../README.md | 63 +++- .../baseline.bmp | 0 .../heterogeneous_stable_diffusion.cpp | 0 samples/cpp/image_generation/image2image.cpp | 36 +++ samples/cpp/image_generation/imageimage.bmp | 3 + .../imwrite.cpp | 0 .../imwrite.hpp | 0 samples/cpp/image_generation/inpainting.cpp | 35 +++ samples/cpp/image_generation/load_image.cpp | 48 +++ samples/cpp/image_generation/load_image.hpp | 12 + .../{text2image => image_generation}/lora.bmp | 0 .../lora_text2image.cpp} | 0 .../text2image.cpp} | 2 +- samples/cpp/text2image/CMakeLists.txt | 66 ----- .../cpp/visual_language_chat/load_image.cpp | 6 +- samples/python/image_generation/README.md | 141 +++++++++ .../baseline.bmp | 0 .../heterogeneous_stable_diffusion.py | 0 .../python/image_generation/image2image.py | 38 +++ samples/python/image_generation/inpainting.py | 38 +++ .../{text2image => image_generation}/lora.bmp | 0 .../lora_text2image.py} | 0 .../text2image.bmp} | 0 .../text2image.py} | 2 +- samples/python/text2image/README.md | 98 ------ .../genai/image_generation/autoencoder_kl.hpp | 1 - .../image_generation/generation_config.hpp | 151 +++++++++- .../image_generation/image2image_pipeline.hpp | 105 +++++++ .../image_generation/inpainting_pipeline.hpp | 106 +++++++ .../image_generation/text2image_pipeline.hpp | 128 +++++++- .../image_generation/diffusion_pipeline.hpp | 43 ++- .../src/image_generation/flux_pipeline.hpp | 102 +++++-- .../image_generation/image2image_pipeline.cpp | 117 ++++++++ .../src/image_generation/image_processor.cpp | 169 +++++++++++ .../src/image_generation/image_processor.hpp | 50 ++++ .../image_generation/inpainting_pipeline.cpp | 122 ++++++++ .../models/autoencoder_kl.cpp | 27 +- src/cpp/src/image_generation/numpy_utils.cpp | 123 ++------ .../src/image_generation/schedulers/ddim.cpp | 11 +- .../src/image_generation/schedulers/ddim.hpp | 2 +- .../schedulers/euler_discrete.cpp | 9 +- .../schedulers/euler_discrete.hpp | 2 +- .../schedulers/flow_match_euler_discrete.cpp | 2 +- .../schedulers/flow_match_euler_discrete.hpp | 2 +- .../schedulers/ischeduler.hpp | 2 +- .../src/image_generation/schedulers/lcm.cpp | 10 +- .../src/image_generation/schedulers/lcm.hpp | 2 +- .../schedulers/lms_discrete.cpp | 2 +- .../schedulers/lms_discrete.hpp | 2 +- .../stable_diffusion_3_pipeline.hpp | 96 ++++-- .../stable_diffusion_pipeline.hpp | 227 ++++++++++---- .../stable_diffusion_xl_pipeline.hpp | 278 +++--------------- .../image_generation/text2image_pipeline.cpp | 16 +- src/docs/SUPPORTED_MODELS.md | 47 ++- src/python/openvino_genai/__init__.py | 2 + src/python/openvino_genai/__init__.pyi | 4 +- .../openvino_genai/py_openvino_genai.pyi | 151 +++++++++- src/python/py_image_generation_pipelines.cpp | 145 ++++++++- 65 files 
changed, 2412 insertions(+), 770 deletions(-) rename samples/cpp/{text2image => image_generation}/512x512.bmp (100%) create mode 100644 samples/cpp/image_generation/CMakeLists.txt rename samples/cpp/{text2image => image_generation}/README.md (60%) rename samples/cpp/{text2image => image_generation}/baseline.bmp (100%) rename samples/cpp/{text2image => image_generation}/heterogeneous_stable_diffusion.cpp (100%) create mode 100644 samples/cpp/image_generation/image2image.cpp create mode 100644 samples/cpp/image_generation/imageimage.bmp rename samples/cpp/{text2image => image_generation}/imwrite.cpp (100%) rename samples/cpp/{text2image => image_generation}/imwrite.hpp (100%) create mode 100644 samples/cpp/image_generation/inpainting.cpp create mode 100644 samples/cpp/image_generation/load_image.cpp create mode 100644 samples/cpp/image_generation/load_image.hpp rename samples/cpp/{text2image => image_generation}/lora.bmp (100%) rename samples/cpp/{text2image/lora.cpp => image_generation/lora_text2image.cpp} (100%) rename samples/cpp/{text2image/main.cpp => image_generation/text2image.cpp} (93%) delete mode 100644 samples/cpp/text2image/CMakeLists.txt create mode 100644 samples/python/image_generation/README.md rename samples/python/{text2image => image_generation}/baseline.bmp (100%) rename samples/python/{text2image => image_generation}/heterogeneous_stable_diffusion.py (100%) create mode 100644 samples/python/image_generation/image2image.py create mode 100644 samples/python/image_generation/inpainting.py rename samples/python/{text2image => image_generation}/lora.bmp (100%) rename samples/python/{text2image/lora.py => image_generation/lora_text2image.py} (100%) rename samples/python/{text2image/image.bmp => image_generation/text2image.bmp} (100%) rename samples/python/{text2image/main.py => image_generation/text2image.py} (99%) delete mode 100644 samples/python/text2image/README.md create mode 100644 src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp create mode 100644 src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp create mode 100644 src/cpp/src/image_generation/image2image_pipeline.cpp create mode 100644 src/cpp/src/image_generation/image_processor.cpp create mode 100644 src/cpp/src/image_generation/image_processor.hpp create mode 100644 src/cpp/src/image_generation/inpainting_pipeline.cpp diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 1d1486c385..504e303fb5 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -591,7 +591,7 @@ jobs: PYTHONPATH: "./build" cpp-greedy_causal_lm-redpajama-3b-chat: - runs-on: ubuntu-20.04-4-cores + runs-on: ubuntu-20.04-8-cores defaults: run: shell: bash diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 63663e633c..b3a36761e1 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -47,11 +47,11 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} cache: 'pip' - - name: Build app + - name: Build apps run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ${{ env.build_dir }} - cmake --build ${{ env.build_dir }} --config Release --target stable_diffusion heterogeneous_stable_diffusion lora_stable_diffusion py_openvino_genai --parallel + cmake --build ${{ env.build_dir }} --config Release --target text2image image2image inpainting heterogeneous_stable_diffusion lora_text2image 
py_openvino_genai --parallel - name: Create virtual environment run: python3 -m venv openvino_lcm_cpp @@ -66,20 +66,32 @@ jobs: run: | source openvino_lcm_cpp/bin/activate optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --task stable-diffusion --weight-format fp16 models/lcm_dreamshaper_v7/FP16 + wget -O ./image.png https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png + wget -O ./mask_image.png https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png - - name: Run app + - name: Run heterogeneous_stable_diffusion run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - ${{ env.build_dir }}/samples/cpp/text2image/heterogeneous_stable_diffusion ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + ${{ env.build_dir }}/samples/cpp/image_generation/heterogeneous_stable_diffusion ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" - - name: Run Python app + - name: Run heterogeneous_stable_diffusion.py run: | source openvino_lcm_cpp/bin/activate source ./ov/setupvars.sh - python ./samples/python/text2image/heterogeneous_stable_diffusion.py ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + python ./samples/python/image_generation/heterogeneous_stable_diffusion.py ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" env: PYTHONPATH: ${{ env.build_dir }} + - name: Run image2image + run: | + source ./ov/setupvars.sh + ${{ env.build_dir }}/samples/cpp/image_generation/image2image ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" ./image.png + + - name: Run inpainting + run: | + source ./ov/setupvars.sh + ${{ env.build_dir }}/samples/cpp/image_generation/inpainting ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" ./image.png ./mask_image.png + lcm_dreamshaper_v7_cpp-windows: runs-on: windows-2019 defaults: @@ -110,11 +122,11 @@ jobs: - name: Create virtual environment run: python -m venv openvino_lcm_cpp - - name: Build app + - name: Build apps run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ${{ env.build_dir }} - cmake --build ${{ env.build_dir }} --config Release --target stable_diffusion heterogeneous_stable_diffusion lora_stable_diffusion py_openvino_genai --parallel + cmake --build ${{ env.build_dir }} --config Release --target text2image image2image inpainting heterogeneous_stable_diffusion lora_text2image py_openvino_genai --parallel - name: Install python dependencies run: | @@ -126,18 +138,38 @@ jobs: run: | . 
"./openvino_lcm_cpp/Scripts/Activate.ps1" optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --task stable-diffusion --weight-format fp16 models/lcm_dreamshaper_v7/FP16 + Invoke-WebRequest -Uri 'https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png' -OutFile 'image.png' + Invoke-WebRequest -Uri 'https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png' -OutFile 'mask_image.png' - - name: Run app + - name: Run heterogeneous_stable_diffusion run: > . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - & "${{ env.build_dir }}/samples/cpp/text2image/Release/heterogeneous_stable_diffusion.exe ./models/lcm_dreamshaper_v7/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" + & "${{ env.build_dir }}/samples/cpp/image_generation/Release/heterogeneous_stable_diffusion.exe ./models/lcm_dreamshaper_v7/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" + + - name: Run heterogeneous_stable_diffusion.py + run: | + . "./openvino_lcm_cpp/Scripts/Activate.ps1" + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + $env:Path += "${{ env.build_dir }}\openvino_genai" + python .\samples\python\image_generation\heterogeneous_stable_diffusion.py .\models\lcm_dreamshaper_v7\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + env: + PYTHONPATH: ${{ env.build_dir }} + + - name: Run image2image.py + run: | + . "./openvino_lcm_cpp/Scripts/Activate.ps1" + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + $env:Path += "${{ env.build_dir }}\openvino_genai" + python .\samples\python\image_generation\image2image.py .\models\lcm_dreamshaper_v7\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" .\image.png + env: + PYTHONPATH: ${{ env.build_dir }} - - name: Run Python app + - name: Run inpainting.py run: | . "./openvino_lcm_cpp/Scripts/Activate.ps1" . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" $env:Path += "${{ env.build_dir }}\openvino_genai" - python .\samples\python\text2image\heterogeneous_stable_diffusion.py .\models\lcm_dreamshaper_v7\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + python .\samples\python\image_generation\inpainting.py .\models\lcm_dreamshaper_v7\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" .\image.png .\mask_image.png env: PYTHONPATH: ${{ env.build_dir }} diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 602f2ae2c8..b355cd4f09 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -51,7 +51,7 @@ jobs: run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ${{ env.build_dir }} - cmake --build ${{ env.build_dir }} --config Release --target stable_diffusion lora_stable_diffusion py_openvino_genai --parallel + cmake --build ${{ env.build_dir }} --config Release --target text2image image2image inpainting heterogeneous_stable_diffusion lora_text2image py_openvino_genai --parallel - name: Create virtual environment run: python3 -m venv openvino_sd_cpp @@ -68,29 +68,29 @@ jobs: optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --weight-format fp16 --task stable-diffusion models/dreamlike-art-dreamlike-anime-1.0/FP16 wget -O ./models/soulcard.safetensors https://civitai.com/api/download/models/72591 - - name: Run main app + - name: Run text2image app run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - ${{ env.build_dir }}/samples/cpp/text2image/stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + ${{ env.build_dir }}/samples/cpp/image_generation/text2image ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" - - name: Run LoRA app + - name: Run lora_text2image app run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - ${{ env.build_dir }}/samples/cpp/text2image/lora_stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "curly-haired unicorn in the forest, anime, line" ./models/soulcard.safetensors 0.7 + ${{ env.build_dir }}/samples/cpp/image_generation/lora_text2image ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "curly-haired unicorn in the forest, anime, line" ./models/soulcard.safetensors 0.7 - - name: Run Python main app + - name: Run text2image.py app run: | source openvino_sd_cpp/bin/activate source ./ov/setupvars.sh - python ./samples/python/text2image/main.py ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + python ./samples/python/image_generation/text2image.py ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" env: PYTHONPATH: ${{ env.build_dir }} - - name: Run Python LoRA app + - name: Run lora_text2image.py app run: | source openvino_sd_cpp/bin/activate source ./ov/setupvars.sh - python ./samples/python/text2image/lora.py ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "curly-haired unicorn in the forest, anime, line" ./models/soulcard.safetensors 0.7 + python 
./samples/python/image_generation/lora_text2image.py ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "curly-haired unicorn in the forest, anime, line" ./models/soulcard.safetensors 0.7 env: PYTHONPATH: ${{ env.build_dir }} @@ -125,7 +125,7 @@ jobs: run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ${{ env.build_dir }} - cmake --build ${{ env.build_dir }} --config Release --target stable_diffusion lora_stable_diffusion py_openvino_genai --parallel + cmake --build ${{ env.build_dir }} --config Release --target text2image image2image inpainting heterogeneous_stable_diffusion lora_text2image py_openvino_genai --parallel - name: Create virtual environment run: python -m venv openvino_sd_cpp @@ -142,35 +142,35 @@ jobs: optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike-art-dreamlike-anime-1.0/FP16 Invoke-WebRequest -Uri 'https://civitai.com/api/download/models/72591' -OutFile 'models/soulcard.safetensors' - - name: Run main app + - name: Run text2image app run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - "${{ env.build_dir }}/samples/cpp/text2image/Release/stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" + "${{ env.build_dir }}/samples/cpp/image_generation/Release/text2image.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" env: PATH: ${{ env.build_dir }}\openvino_genai - - name: Run LoRA app + - name: Run lora_text2image app run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - "${{ env.build_dir }}/samples/cpp/text2image/Release/lora_stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'curly-haired unicorn in the forest, anime, line' ./models/soulcard.safetensors 0.7" + "${{ env.build_dir }}/samples/cpp/image_generation/Release/lora_text2image.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'curly-haired unicorn in the forest, anime, line' ./models/soulcard.safetensors 0.7" env: PATH: ${{ env.build_dir }}\openvino_genai - - name: Run Python main app + - name: Run text2image.py app run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" $env:Path += "${{ env.build_dir }}\openvino_genai" - python .\samples\python\text2image\main.py .\models\dreamlike-art-dreamlike-anime-1.0\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + python .\samples\python\image_generation\text2image.py .\models\dreamlike-art-dreamlike-anime-1.0\FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" env: PYTHONPATH: ${{ env.build_dir }} - - name: Run Python LoRA app + - name: Run lora_text2image.py app run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" $env:Path += "${{ env.build_dir }}\openvino_genai" - python .\samples\python\text2image\lora.py .\models\dreamlike-art-dreamlike-anime-1.0\FP16 "curly-haired unicorn in the forest, anime, line" .\models\soulcard.safetensors 0.7 + python .\samples\python\image_generation\lora_text2image.py .\models\dreamlike-art-dreamlike-anime-1.0\FP16 "curly-haired unicorn in the forest, anime, line" .\models\soulcard.safetensors 0.7 env: PYTHONPATH: ${{ env.build_dir }} diff --git a/README.md b/README.md index c1217a0215..c00971a4e3 100644 --- a/README.md +++ b/README.md @@ -147,8 +147,8 @@ print(pipe.generate(prompt, image=image_data, max_new_tokens=100)) Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details). See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/cpp/visual_language_chat) for a demo application. ```cpp +#include "openvino/genai/visual_language/pipeline.hpp" #include "load_image.hpp" -#include #include int main(int argc, char* argv[]) { @@ -179,10 +179,10 @@ For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2 ```sh #Download and convert to OpenVINO dreamlike-anime-1.0 model -optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16 +optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --weight-format fp16 dreamlike_anime_1_0_ov/FP16 #You can also use INT8 hybrid quantization to further optimize the model and reduce inference latency -optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format int8 --dataset conceptual_captions dreamlike_anime_1_0_ov/INT8 +optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --weight-format int8 --dataset conceptual_captions dreamlike_anime_1_0_ov/INT8 ``` ### Run generation using Text2Image API in Python @@ -192,23 +192,17 @@ import argparse from PIL import Image import openvino_genai -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('model_dir') - parser.add_argument('prompt') - args = parser.parse_args() - - device = 'CPU' # GPU, NPU can be used as well - pipe = openvino_genai.Text2ImagePipeline(args.model_dir, device) - image_tensor = pipe.generate( - args.prompt, - width=512, - height=512, - num_inference_steps=20 - ) - - image = Image.fromarray(image_tensor.data[0]) - image.save("image.bmp") +device = 'CPU' # GPU can be used as well +pipe = openvino_genai.Text2ImagePipeline("./dreamlike_anime_1_0_ov/INT8", device) +image_tensor = pipe.generate( + "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting", + width=512, + height=512, + num_inference_steps=20 +) + +image = Image.fromarray(image_tensor.data[0]) +image.save("image.bmp") ``` ### Run generation using Text2Image API in C++ @@ -218,10 +212,10 @@ Code below requires installation of C++ compatible package (see [here](https://d ```cpp #include "openvino/genai/image_generation/text2image_pipeline.hpp" #include "imwrite.hpp" -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) { const std::string models_path = argv[1], prompt = argv[2]; - const std::string device = "CPU"; // GPU, NPU can be used as well + const std::string device = "CPU"; // GPU can be used as well ov::genai::Text2ImagePipeline 
pipe(models_path, device); ov::Tensor image = pipe.generate(prompt, @@ -232,6 +226,106 @@ int main(int argc, char* argv[]) { imwrite("image.bmp", image, true); } ``` + +### Run generation using Image2Image API in Python + +```python +import numpy as np +from PIL import Image +import openvino_genai +import openvino as ov + +device = 'CPU' # GPU can be used as well +pipe = openvino_genai.Image2ImagePipeline("./dreamlike_anime_1_0_ov/INT8", device) + +image = Image.open("small_city.jpg") +image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8) +image_data = ov.Tensor(image_data) + +image_tensor = pipe.generate( + "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting", + image=image_data, + strength=0.8 +) + +image = Image.fromarray(image_tensor.data[0]) +image.save("image.bmp") +``` + +### Run generation using Image2Image API in C++ + +Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for additional setup details, or this blog for full instructions: [How to Build OpenVINO™ GenAI APP in C++](https://medium.com/openvino-toolkit/how-to-build-openvino-genai-app-in-c-32dcbe42fa67)). + +```cpp +#include "openvino/genai/image_generation/image2image_pipeline.hpp" +#include "load_image.hpp" +#include "imwrite.hpp" + +int main(int argc, char* argv[]) { + const std::string models_path = argv[1], prompt = argv[2], image_path = argv[3]; + const std::string device = "CPU"; // GPU can be used as well + + ov::Tensor image = utils::load_image(image_path); + + ov::genai::Image2ImagePipeline pipe(models_path, device); + ov::Tensor generated_image = pipe.generate(prompt, image, ov::genai::strength(0.8f)); + + imwrite("image.bmp", generated_image, true); +} +``` + +### Run generation using Inpainting API in Python + +```python +import numpy as np +from PIL import Image +import openvino_genai +import openvino as ov + +def read_image(path: str) -> ov.Tensor: + pic = Image.open(path).convert("RGB") + image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) + return ov.Tensor(image_data) + +device = 'CPU' # GPU can be used as well +pipe = openvino_genai.InpaintingPipeline("./stable-diffusion-2-inpainting", device) + +image = read_image("image.jpg") +mask_image = read_image("mask.jpg") + +image_tensor = pipe.generate( + "Face of a yellow cat, high resolution, sitting on a park bench", + image=image, + mask_image=mask_image +) + +image = Image.fromarray(image_tensor.data[0]) +image.save("image.bmp") +``` + +### Run generation using Inpainting API in C++ + +Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for additional setup details, or this blog for full instructions: [How to Build OpenVINO™ GenAI APP in C++](https://medium.com/openvino-toolkit/how-to-build-openvino-genai-app-in-c-32dcbe42fa67)). + +```cpp +#include "openvino/genai/image_generation/inpainting_pipeline.hpp" +#include "load_image.hpp" +#include "imwrite.hpp" + +int main(int argc, char* argv[]) { + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU can be used as well + + ov::Tensor image = utils::load_image(argv[3]); + ov::Tensor mask_image = utils::load_image(argv[4]); + + ov::genai::InpaintingPipeline pipe(models_path, device); + ov::Tensor 
generated_image = pipe.generate(prompt, image, mask_image); + + imwrite("image.bmp", generated_image, true); +} +``` + ### Sample notebooks using this API See [here](https://openvinotoolkit.github.io/openvino_notebooks/?search=Text+to+Image+pipeline+and+OpenVINO+with+Generate+API) @@ -259,7 +353,6 @@ NOTE: This sample is a simplified version of the full sample that is available [ import openvino_genai import librosa - def read_wav(filepath): raw_speech, samplerate = librosa.load(filepath, sr=16000) return raw_speech.tolist() diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 860ced072b..92f0b3f43a 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -10,7 +10,7 @@ add_subdirectory(cpp/lora_greedy_causal_lm) add_subdirectory(cpp/multinomial_causal_lm) add_subdirectory(cpp/prompt_lookup_decoding_lm) add_subdirectory(cpp/speculative_decoding_lm) -add_subdirectory(cpp/text2image) +add_subdirectory(cpp/image_generation) add_subdirectory(cpp/visual_language_chat) add_subdirectory(cpp/whisper_speech_recognition) @@ -26,11 +26,11 @@ install(DIRECTORY cpp/benchmark_genai cpp/chat_sample cpp/greedy_causal_lm + cpp/image_generation cpp/lora_greedy_causal_lm cpp/multinomial_causal_lm # Don't install prompt_lookup_decoding_lm because it doesn't use openvino_genai library and is not verified yet. cpp/speculative_decoding_lm - cpp/text2image cpp/visual_language_chat cpp/whisper_speech_recognition DESTINATION samples/cpp COMPONENT cpp_samples_genai) @@ -40,9 +40,9 @@ install(DIRECTORY python/benchmark_genai python/chat_sample python/greedy_causal_lm + python/image_generation python/multinomial_causal_lm python/speculative_decoding_lm - python/text2image python/visual_language_chat python/whisper_speech_recognition DESTINATION samples/python COMPONENT cpp_samples_genai diff --git a/samples/cpp/text2image/512x512.bmp b/samples/cpp/image_generation/512x512.bmp similarity index 100% rename from samples/cpp/text2image/512x512.bmp rename to samples/cpp/image_generation/512x512.bmp diff --git a/samples/cpp/image_generation/CMakeLists.txt b/samples/cpp/image_generation/CMakeLists.txt new file mode 100644 index 0000000000..004b305088 --- /dev/null +++ b/samples/cpp/image_generation/CMakeLists.txt @@ -0,0 +1,99 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH +) + +file(DOWNLOAD https://raw.githubusercontent.com/nothings/stb/f75e8d1cad7d90d72ef7a4661f1b994ef78b4e31/stb_image.h ${CMAKE_BINARY_DIR}/stb_image.h + EXPECTED_HASH MD5=27932e6fb3a2f26aee2fc33f2cb4e696) + +# create main sample executable + +add_executable(text2image text2image.cpp imwrite.cpp) + +target_include_directories(text2image PRIVATE ${CMAKE_BINARY_DIR} "${CMAKE_CURRENT_SOURCE_DIR}") +target_link_libraries(text2image PRIVATE openvino::genai) + +set_target_properties(text2image PROPERTIES + COMPILE_PDB_NAME text2image + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS text2image + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) + +# create LoRA sample executable + +add_executable(lora_text2image lora_text2image.cpp imwrite.cpp) + +target_include_directories(lora_text2image PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +target_link_libraries(lora_text2image PRIVATE openvino::genai) + +set_target_properties(lora_text2image PROPERTIES + COMPILE_PDB_NAME lora_text2image + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS lora_text2image + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) + +# create heterogeneous_stable_diffusion sample executable + +add_executable(heterogeneous_stable_diffusion + heterogeneous_stable_diffusion.cpp + imwrite.cpp) + +target_include_directories(heterogeneous_stable_diffusion PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +target_link_libraries(heterogeneous_stable_diffusion PRIVATE openvino::genai) + +set_target_properties(heterogeneous_stable_diffusion PROPERTIES + COMPILE_PDB_NAME heterogeneous_stable_diffusion + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS heterogeneous_stable_diffusion + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) + +# create image2image sample executable + +add_executable(image2image image2image.cpp load_image.cpp imwrite.cpp) + +target_include_directories(image2image PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}") +target_link_libraries(image2image PRIVATE openvino::genai) + +set_target_properties(image2image PROPERTIES + COMPILE_PDB_NAME image2image + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS image2image + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) + +# create LoRA sample executable + +add_executable(inpainting inpainting.cpp load_image.cpp imwrite.cpp) + +target_include_directories(inpainting PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}") +target_link_libraries(inpainting PRIVATE openvino::genai) + +set_target_properties(inpainting PROPERTIES + COMPILE_PDB_NAME inpainting + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS inpainting + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) \ No newline at end of file diff --git a/samples/cpp/text2image/README.md b/samples/cpp/image_generation/README.md similarity index 60% rename from samples/cpp/text2image/README.md rename to samples/cpp/image_generation/README.md index ac736b2383..795bea8999 100644 --- a/samples/cpp/text2image/README.md +++ b/samples/cpp/image_generation/README.md @@ -2,10 +2,12 @@ Examples in this folder showcase inference of text to image models like Stable Diffusion 1.5, 2.1, LCM. 
The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::Text2ImagePipeline` and uses a text prompt as input source. -There are three sample files: - - [`main.cpp`](./main.cpp) demonstrates basic usage of the text to image pipeline - - [`lora.cpp`](./lora.cpp) shows how to apply LoRA adapters to the pipeline +There are several sample files: + - [`text2image.cpp`](./text2image.cpp) demonstrates basic usage of the text to image pipeline + - [`lora_text2image.cpp`](./lora_text2image.cpp) shows how to apply LoRA adapters to the pipeline - [`heterogeneous_stable_diffusion.cpp`](./heterogeneous_stable_diffusion.cpp) shows how to assemble a heterogeneous txt2image pipeline from individual subcomponents (scheduler, text encoder, unet, vae decoder) + - [`image2image.cpp`](./image2image.cpp) demonstrates basic usage of the image to image pipeline + - [`inpainting.cpp`](./inpainting.cpp) demonstrates basic usage of the inpainting pipeline Users can change the sample code and play with the following generation parameters: @@ -13,8 +15,10 @@ Users can change the sample code and play with the following generation paramete - Generate multiple images per prompt - Adjust a number of inference steps - Play with [guidance scale](https://huggingface.co/spaces/stabilityai/stable-diffusion/discussions/9) (read [more details](https://arxiv.org/abs/2207.12598)) -- (SD 1.x, 2.x only) Add negative prompt when guidance scale > 1 +- (SD 1.x, 2.x; SD3, SDXL) Add negative prompt when guidance scale > 1 +- (SDXL, SD3, FLUX) Specify other positive prompts like `prompt_2` - Apply multiple different LoRA adapters and mix them with different blending coefficients +- (Image to image and inpainting) Play with the `strength` parameter to control how much the initial image is noised and to reduce the number of inference steps ## Download and convert the models and tokenizers @@ -27,7 +31,7 @@ pip install --upgrade-strategy eager -r ../../requirements.txt optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16 ``` -## Run +## Run text to image Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. @@ -39,7 +43,7 @@ Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk gol ![](./512x512.bmp) -## Run with callback +### Run with callback You can also add a callback to the `text2image.cpp` file to interrupt the image generation process earlier if you are satisfied with the intermediate result of the image generation or to add logs. @@ -57,7 +61,7 @@ auto callback = [&](size_t step, ov::Tensor& intermediate_res) -> bool { }; ov::Tensor image = pipe.generate(prompt, - ... + /* other generation properties */ ov::genai::callback(callback) ); ``` @@ -91,16 +95,55 @@ With adapter | Without adapter C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor`. So, it's expected that the Python and C++ versions produce different images, because the latent images are initialized differently. Users can implement their own random generator derived from `ov::genai::Generator` and pass it to `Text2ImagePipeline::generate` method. 
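As a rough illustration of that note, a minimal seeded generator could look like the sketch below. This is an assumption-laden sketch rather than code from this patch: it assumes `ov::genai::Generator` declares a virtual `float next()` used to draw standard-normal samples, and that an `ov::genai::generator(...)` property accepting a `std::shared_ptr<Generator>` can be passed to `generate`.

```cpp
#include <memory>
#include <random>

#include "openvino/genai/image_generation/text2image_pipeline.hpp"

// Hypothetical seeded generator; the base-class interface (virtual float next())
// is assumed here, not taken from this patch.
class SeededGenerator : public ov::genai::Generator {
public:
    explicit SeededGenerator(uint32_t seed) : m_engine(seed), m_normal(0.0f, 1.0f) {}
    float next() override { return m_normal(m_engine); }
private:
    std::mt19937 m_engine;
    std::normal_distribution<float> m_normal;
};

// Possible usage (property name assumed):
// ov::Tensor image = pipe.generate(prompt, ov::genai::generator(std::make_shared<SeededGenerator>(42)));
```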
-## Run with multiple devices +## Run text to image with multiple devices The `heterogeneous_stable_diffusion` sample demonstrates how a Text2ImagePipeline object can be created from individual subcomponents - scheduler, text encoder, unet, & vae decoder. This approach gives fine-grained control over the devices used to execute each stage of the stable diffusion pipeline. The usage of this sample is: -`heterogeneous_stable_diffusion <MODEL_DIR> '<PROMPT>' [ <TEXT_ENCODER_DEVICE> <UNET_DEVICE> <VAE_DECODER_DEVICE> ]` +`./heterogeneous_stable_diffusion <MODEL_DIR> '<PROMPT>' [ <TEXT_ENCODER_DEVICE> <UNET_DEVICE> <VAE_DECODER_DEVICE> ]` For example: -`heterogeneous_stable_diffusion ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting' CPU NPU GPU` +`./heterogeneous_stable_diffusion ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting' CPU NPU GPU` The sample will create a stable diffusion pipeline such that the text encoder is executed on the CPU, UNet on the NPU, and VAE decoder on the GPU. + +## Run image to image pipeline + +The `image2image.cpp` sample demonstrates a basic image to image generation pipeline. The difference from the text to image pipeline is that the final image is denoised from the initial image, which is converted to latent space and noised according to the `strength` parameter. `strength` should be in the range `[0., 1.]`, where `1.` means the initial image is fully noised, which is equivalent to text to image generation. +Also, the `strength` parameter linearly affects the number of inference steps, because lower `strength` values mean the initial latent already has some structure and requires fewer steps to denoise. + +To run the sample, download the initial image first: + +`wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png` + +And then run the sample: + +`./image2image ./dreamlike_anime_1_0_ov/FP16 'cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k' cat.png` + +The resulting image is: + + ![](./imageimage.bmp) + +Note that LoRA, heterogeneous execution and other features of `Text2ImagePipeline` are also applicable to `Image2ImagePipeline`. + +## Run inpainting pipeline + +The `inpainting.cpp` sample demonstrates usage of the inpainting pipeline, which inpaints the initial image according to a given mask. The inpainting pipeline works on typical text to image models as well as on specialized models which are often named `space/model-inpainting`, e.g. `stabilityai/stable-diffusion-2-inpainting`. + +Such models can be converted in the same way as regular ones via `optimum-cli`: + +`optimum-cli export openvino --model stabilityai/stable-diffusion-2-inpainting --weight-format fp16 stable-diffusion-2-inpainting` + +Let's also download input data: + +`wget -O image.png https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png` + +`wget -O mask_image.png https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png` + +And run the sample: + +`./inpainting ./stable-diffusion-2-inpainting 'Face of a yellow cat, high resolution, sitting on a park bench' image.png mask_image.png` + +Note that LoRA, heterogeneous execution and other features of `Text2ImagePipeline` are also applicable to `InpaintingPipeline`. 
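To make the `strength` behaviour described in the image to image section above concrete, the helper below sketches the usual diffusers-style convention for how many denoising iterations actually run; the exact schedule used by the pipelines in this patch may differ, so treat it as an approximation only.

```cpp
#include <algorithm>
#include <cstddef>

// Approximate, diffusers-style convention (assumed, not taken from this patch):
// an image-to-image run skips the first (1 - strength) fraction of the schedule,
// so roughly strength * num_inference_steps denoising iterations are executed.
inline std::size_t estimate_denoising_iterations(std::size_t num_inference_steps, float strength) {
    auto scaled = static_cast<std::size_t>(num_inference_steps * strength);
    return std::min(scaled, num_inference_steps); // e.g. 20 steps with strength 0.8 -> ~16 iterations
}
```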
diff --git a/samples/cpp/text2image/baseline.bmp b/samples/cpp/image_generation/baseline.bmp similarity index 100% rename from samples/cpp/text2image/baseline.bmp rename to samples/cpp/image_generation/baseline.bmp diff --git a/samples/cpp/text2image/heterogeneous_stable_diffusion.cpp b/samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp similarity index 100% rename from samples/cpp/text2image/heterogeneous_stable_diffusion.cpp rename to samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp diff --git a/samples/cpp/image_generation/image2image.cpp b/samples/cpp/image_generation/image2image.cpp new file mode 100644 index 0000000000..c071b88362 --- /dev/null +++ b/samples/cpp/image_generation/image2image.cpp @@ -0,0 +1,36 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/image_generation/image2image_pipeline.hpp" + +#include "load_image.hpp" +#include "imwrite.hpp" + +int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc == 4, "Usage: ", argv[0], " '' "); + + const std::string models_path = argv[1], prompt = argv[2], image_path = argv[3]; + const std::string device = "CPU"; // GPU can be used as well + + ov::Tensor image = utils::load_image(image_path); + + ov::genai::Image2ImagePipeline pipe(models_path, device); + ov::Tensor generated_image = pipe.generate(prompt, image, + // controls how initial image is noised after being converted to latent space. `1` means initial image is fully noised + ov::genai::strength(0.8f)); + + // writes `num_images_per_prompt` images by pattern name + imwrite("image_%d.bmp", generated_image, true); + + return EXIT_SUCCESS; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/image_generation/imageimage.bmp b/samples/cpp/image_generation/imageimage.bmp new file mode 100644 index 0000000000..0ed5e1799e --- /dev/null +++ b/samples/cpp/image_generation/imageimage.bmp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ecb4783a8f3a0962659ebf80eeaf0c0e48c44995c1e60001f215e0697ab9397 +size 2162742 diff --git a/samples/cpp/text2image/imwrite.cpp b/samples/cpp/image_generation/imwrite.cpp similarity index 100% rename from samples/cpp/text2image/imwrite.cpp rename to samples/cpp/image_generation/imwrite.cpp diff --git a/samples/cpp/text2image/imwrite.hpp b/samples/cpp/image_generation/imwrite.hpp similarity index 100% rename from samples/cpp/text2image/imwrite.hpp rename to samples/cpp/image_generation/imwrite.hpp diff --git a/samples/cpp/image_generation/inpainting.cpp b/samples/cpp/image_generation/inpainting.cpp new file mode 100644 index 0000000000..4c7a758450 --- /dev/null +++ b/samples/cpp/image_generation/inpainting.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/image_generation/inpainting_pipeline.hpp" + +#include "load_image.hpp" +#include "imwrite.hpp" + +int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc == 5, "Usage: ", argv[0], " '' "); + + const std::string models_path = argv[1], prompt = argv[2], image_path = argv[3], mask_image_path = argv[4]; + const std::string device = "CPU"; // GPU can be used as well + + ov::Tensor image = utils::load_image(image_path); + ov::Tensor mask_image = utils::load_image(mask_image_path); + + ov::genai::InpaintingPipeline pipe(models_path, device); + ov::Tensor generated_image = pipe.generate(prompt, image, mask_image); + + // writes `num_images_per_prompt` images by pattern name + imwrite("image_%d.bmp", generated_image, true); + + return EXIT_SUCCESS; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/image_generation/load_image.cpp b/samples/cpp/image_generation/load_image.cpp new file mode 100644 index 0000000000..7f14e54b3e --- /dev/null +++ b/samples/cpp/image_generation/load_image.cpp @@ -0,0 +1,48 @@ + +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include + +#define STB_IMAGE_IMPLEMENTATION + +#include "stb_image.h" +#include "load_image.hpp" + +namespace fs = std::filesystem; + +ov::Tensor utils::load_image(const std::filesystem::path& image_path) { + int x = 0, y = 0, channels_in_file = 0; + constexpr int desired_channels = 3; + unsigned char* image = stbi_load( + image_path.string().c_str(), + &x, &y, &channels_in_file, desired_channels); + if (!image) { + std::stringstream error_message; + error_message << "Failed to load the image '" << image_path << "'"; + throw std::runtime_error{error_message.str()}; + } + struct SharedImageAllocator { + unsigned char* image; + int channels, height, width; + void* allocate(size_t bytes, size_t) const { + if (image && channels * height * width == bytes) { + return image; + } + throw std::runtime_error{"Unexpected number of bytes was requested to allocate."}; + } + void deallocate(void*, size_t bytes, size_t) { + if (channels * height * width != bytes) { + throw std::runtime_error{"Unexpected number of bytes was requested to deallocate."}; + } + stbi_image_free(image); + image = nullptr; + } + bool is_equal(const SharedImageAllocator& other) const noexcept {return this == &other;} + }; + return ov::Tensor( + ov::element::u8, + ov::Shape{1, size_t(y), size_t(x), size_t(desired_channels)}, + SharedImageAllocator{image, desired_channels, y, x} + ); +} diff --git a/samples/cpp/image_generation/load_image.hpp b/samples/cpp/image_generation/load_image.hpp new file mode 100644 index 0000000000..f66dd2caf2 --- /dev/null +++ b/samples/cpp/image_generation/load_image.hpp @@ -0,0 +1,12 @@ + +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +namespace utils { +ov::Tensor load_image(const std::filesystem::path& image_path); +} diff --git a/samples/cpp/text2image/lora.bmp b/samples/cpp/image_generation/lora.bmp similarity index 100% rename from samples/cpp/text2image/lora.bmp rename to samples/cpp/image_generation/lora.bmp diff --git a/samples/cpp/text2image/lora.cpp b/samples/cpp/image_generation/lora_text2image.cpp similarity index 100% rename from samples/cpp/text2image/lora.cpp rename to samples/cpp/image_generation/lora_text2image.cpp diff --git a/samples/cpp/text2image/main.cpp b/samples/cpp/image_generation/text2image.cpp similarity index 93% rename from samples/cpp/text2image/main.cpp rename to samples/cpp/image_generation/text2image.cpp index 5789e09884..6a97b3a074 100644 --- a/samples/cpp/text2image/main.cpp +++ b/samples/cpp/image_generation/text2image.cpp @@ -9,7 +9,7 @@ int32_t main(int32_t argc, char* argv[]) try { OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); const std::string models_path = argv[1], prompt = argv[2]; - const std::string device = "CPU"; // GPU, NPU can be used as well + const std::string device = "CPU"; // GPU can be used as well ov::genai::Text2ImagePipeline pipe(models_path, device); ov::Tensor image = pipe.generate(prompt, diff --git a/samples/cpp/text2image/CMakeLists.txt b/samples/cpp/text2image/CMakeLists.txt deleted file mode 100644 index 
42b994dd71..0000000000 --- a/samples/cpp/text2image/CMakeLists.txt +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. - NO_CMAKE_FIND_ROOT_PATH -) - -# create main sample executable - -add_executable(stable_diffusion - ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/imwrite.cpp) - -target_include_directories(stable_diffusion PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -target_link_libraries(stable_diffusion PRIVATE openvino::genai) - -set_target_properties(stable_diffusion PROPERTIES - COMPILE_PDB_NAME stable_diffusion - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) - -install(TARGETS stable_diffusion - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) - -# create LoRA sample executable - -add_executable(lora_stable_diffusion - ${CMAKE_CURRENT_SOURCE_DIR}/lora.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/imwrite.cpp) - -target_include_directories(lora_stable_diffusion PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -target_link_libraries(lora_stable_diffusion PRIVATE openvino::genai) - -set_target_properties(lora_stable_diffusion PROPERTIES - COMPILE_PDB_NAME lora_stable_diffusion - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) - -install(TARGETS lora_stable_diffusion - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) - -# create heterogeneous_stable_diffusion sample executable - -add_executable(heterogeneous_stable_diffusion - ${CMAKE_CURRENT_SOURCE_DIR}/heterogeneous_stable_diffusion.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/imwrite.cpp) - -target_include_directories(heterogeneous_stable_diffusion PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -target_link_libraries(heterogeneous_stable_diffusion PRIVATE openvino::genai) - -set_target_properties(heterogeneous_stable_diffusion PROPERTIES - COMPILE_PDB_NAME heterogeneous_stable_diffusion - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) - -install(TARGETS heterogeneous_stable_diffusion - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/visual_language_chat/load_image.cpp b/samples/cpp/visual_language_chat/load_image.cpp index 8e2e377239..6dd246d647 100644 --- a/samples/cpp/visual_language_chat/load_image.cpp +++ b/samples/cpp/visual_language_chat/load_image.cpp @@ -2,6 +2,8 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include + #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" #include "load_image.hpp" @@ -30,7 +32,9 @@ ov::Tensor utils::load_image(const std::filesystem::path& image_path) { image_path.string().c_str(), &x, &y, &channels_in_file, desired_channels); if (!image) { - throw std::runtime_error{"Failed to load the image."}; + std::stringstream error_message; + error_message << "Failed to load the image '" << image_path << "'"; + throw std::runtime_error{error_message.str()}; } struct SharedImageAllocator { unsigned char* image; diff --git a/samples/python/image_generation/README.md b/samples/python/image_generation/README.md new file mode 100644 index 0000000000..4abe45b2b4 --- /dev/null +++ b/samples/python/image_generation/README.md @@ -0,0 +1,141 @@ +# Text to Image Python Generation Pipeline + +Examples in this folder showcase inference of text to image models like 
Stable Diffusion 1.5, 2.1, LCM. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.Text2ImagePipeline` and uses a text prompt as input source. + +There are several sample files: + - [`text2image.py`](./text2image.py) demonstrates basic usage of the text to image pipeline + - [`lora_text2image.py`](./lora_text2image.py) shows how to apply LoRA adapters to the pipeline + - [`heterogeneous_stable_diffusion.py`](./heterogeneous_stable_diffusion.py) shows how to assemble a heterogeneous text2image pipeline from individual subcomponents (scheduler, text encoder, unet, vae decoder) + - [`image2image.py`](./image2image.py) demonstrates basic usage of the image to image pipeline + - [`inpainting.py`](./inpainting.py) demonstrates basic usage of the inpainting pipeline + +Users can change the sample code and play with the following generation parameters: + +- Change width or height of generated image +- Generate multiple images per prompt +- Adjust a number of inference steps +- Play with [guidance scale](https://huggingface.co/spaces/stabilityai/stable-diffusion/discussions/9) (read [more details](https://arxiv.org/abs/2207.12598)) +- (SD 1.x, 2.x; SD3, SDXL) Add negative prompt when guidance scale > 1 +- (SDXL, SD3, FLUX) Specify other positive prompts like `prompt_2` +- Apply multiple different LoRA adapters and mix them with different blending coefficients +- (Image to image and inpainting) Play with the `strength` parameter to control how much the initial image is noised and to reduce the number of inference steps + +## Download and convert the models and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. + +```sh +pip install --upgrade-strategy eager -r ../../export-requirements.txt +optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16 +``` + +## Run text to image + +Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then run a sample: + +`python text2image.py ./dreamlike_anime_1_0_ov/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"` + +### Examples + +Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` + + ![](./text2image.bmp) + +### Run with callback + +You can also add a callback to the `text2image.py` file to interrupt the image generation process earlier if you are satisfied with the intermediate result of the image generation or to add logs. + +Please find the template of the callback usage below. + +```python +pipe = openvino_genai.Text2ImagePipeline(model_dir, device) + +def callback(step, intermediate_res): + print("Image generation step: ", step) + image_tensor = pipe.decode(intermediate_res) # get intermediate image tensor + if your_condition: # return True if you want to interrupt image generation + return True + return False + +image = pipe.generate( + ... + callback = callback +) +``` + +## Run with optional LoRA adapters + +LoRA adapters can be connected to the pipeline and modify generated images to have a certain style, details or quality. 
Adapters are supported in Safetensors format and can be downloaded from public sources like [Civitai](https://civitai.com) or [HuggingFace](https://huggingface.co/models) or trained by the user. Only adapters compatible with the base model should be used. A weighted blend of multiple adapters can be applied by specifying multiple adapter files with corresponding alpha parameters on the command line. Check the `lora_text2image.py` source code to learn how to enable adapters and specify them in each `generate` call. + +Here is an example of how to run the sample with a single adapter. First download the adapter file from the https://civitai.com/models/67927/soulcard page manually and save it as `soulcard.safetensors`. Or download it from the command line: + +`wget -O soulcard.safetensors https://civitai.com/api/download/models/72591` + +Then run `lora_text2image.py`: + +`python lora_text2image.py ./dreamlike_anime_1_0_ov/FP16 "curly-haired unicorn in the forest, anime, line" soulcard.safetensors 0.7` + +The sample generates two images with and without adapters applied using the same prompt: + - `lora.bmp` with adapters applied + - `baseline.bmp` without adapters applied + +Check the difference: + +With adapter | Without adapter +:---:|:---: +![](./lora.bmp) | ![](./baseline.bmp) + +## Run text to image with multiple devices + +The `heterogeneous_stable_diffusion.py` sample demonstrates how a Text2ImagePipeline object can be created from individual subcomponents - scheduler, text encoder, unet, & vae decoder. This approach gives fine-grained control over the devices used to execute each stage of the stable diffusion pipeline. + +The usage of this sample is: + +`heterogeneous_stable_diffusion.py [-h] model_dir prompt [text_encoder_device] [unet_device] [vae_decoder_device]` + +For example: + +`python heterogeneous_stable_diffusion.py ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting' CPU NPU GPU` + +The sample will create a stable diffusion pipeline such that the text encoder is executed on the CPU, UNet on the NPU, and VAE decoder on the GPU. + +## Run image to image pipeline + +The `image2image.py` sample demonstrates a basic image to image generation pipeline. The difference from the text to image pipeline is that the final image is denoised from the initial image, which is converted to latent space and noised according to the `strength` parameter. `strength` should be in the range `[0., 1.]`, where `1.` means the initial image is fully noised, which is equivalent to text to image generation. +Also, the `strength` parameter linearly affects the number of inference steps, because lower `strength` values mean the initial latent already has some structure and requires fewer steps to denoise. + +To run the sample, download the initial image first: + +`wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png` + +And then run the sample: + +`python image2image.py ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting' cat.png` + +The resulting image is: + + ![](./../../cpp/image_generation/imageimage.bmp) + +Note that LoRA, heterogeneous execution and other features of `Text2ImagePipeline` are also applicable to `Image2ImagePipeline`. + +## Run inpainting pipeline + +The `inpainting.py` sample demonstrates usage of the inpainting pipeline, which inpaints the initial image according to a given mask. 
The inpainting pipeline can work on typical text to image models as well as on specialized models which are often named `space/model-inpainting`, e.g. `stabilityai/stable-diffusion-2-inpainting`. + +Such models can be converted in the same way as regular ones via `optimum-cli`: + +`optimum-cli export openvino --model stabilityai/stable-diffusion-2-inpainting --weight-format fp16 stable-diffusion-2-inpainting` + +Let's also download input data: + +`wget -O image.png https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png` + +`wget -O mask_image.png https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png` + +And run the sample: + +`python inpainting.py ./stable-diffusion-2-inpainting 'Face of a yellow cat, high resolution, sitting on a park bench' image.png mask_image.png` + +Note that LoRA, heterogeneous execution and other features of `Text2ImagePipeline` are also applicable to `InpaintingPipeline`. diff --git a/samples/python/text2image/baseline.bmp b/samples/python/image_generation/baseline.bmp similarity index 100% rename from samples/python/text2image/baseline.bmp rename to samples/python/image_generation/baseline.bmp diff --git a/samples/python/text2image/heterogeneous_stable_diffusion.py b/samples/python/image_generation/heterogeneous_stable_diffusion.py similarity index 100% rename from samples/python/text2image/heterogeneous_stable_diffusion.py rename to samples/python/image_generation/heterogeneous_stable_diffusion.py diff --git a/samples/python/image_generation/image2image.py b/samples/python/image_generation/image2image.py new file mode 100644 index 0000000000..bb452036bf --- /dev/null +++ b/samples/python/image_generation/image2image.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino +import openvino_genai +import numpy as np + +from PIL import Image + +def read_image(path: str) -> openvino.Tensor: + pic = Image.open(path).convert("RGB") + image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) + return openvino.Tensor(image_data) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model_dir') + parser.add_argument('prompt') + parser.add_argument('image') + args = parser.parse_args() + + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.Image2ImagePipeline(args.model_dir, device) + + image = read_image(args.image) + + image_tensor = pipe.generate(args.prompt, image, + strength=0.8 # controls how initial image is noised after being converted to latent space. 
`1` means initial image is fully noised + ) + + image = Image.fromarray(image_tensor.data[0]) + image.save("image.bmp") + + +if '__main__' == __name__: + main() diff --git a/samples/python/image_generation/inpainting.py b/samples/python/image_generation/inpainting.py new file mode 100644 index 0000000000..47ecfd4495 --- /dev/null +++ b/samples/python/image_generation/inpainting.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino +import openvino_genai +import numpy as np + +from PIL import Image + +def read_image(path: str) -> openvino.Tensor: + pic = Image.open(path).convert("RGB") + image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) + return openvino.Tensor(image_data) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model_dir') + parser.add_argument('prompt') + parser.add_argument('image') + parser.add_argument('mask') + args = parser.parse_args() + + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.InpaintingPipeline(args.model_dir, device) + + image = read_image(args.image) + mask_image = read_image(args.mask) + + image_tensor = pipe.generate(args.prompt, image, mask_image) + + image = Image.fromarray(image_tensor.data[0]) + image.save("image.bmp") + + +if '__main__' == __name__: + main() diff --git a/samples/python/text2image/lora.bmp b/samples/python/image_generation/lora.bmp similarity index 100% rename from samples/python/text2image/lora.bmp rename to samples/python/image_generation/lora.bmp diff --git a/samples/python/text2image/lora.py b/samples/python/image_generation/lora_text2image.py similarity index 100% rename from samples/python/text2image/lora.py rename to samples/python/image_generation/lora_text2image.py diff --git a/samples/python/text2image/image.bmp b/samples/python/image_generation/text2image.bmp similarity index 100% rename from samples/python/text2image/image.bmp rename to samples/python/image_generation/text2image.bmp diff --git a/samples/python/text2image/main.py b/samples/python/image_generation/text2image.py similarity index 99% rename from samples/python/text2image/main.py rename to samples/python/image_generation/text2image.py index fa9f339b30..95d8c68e82 100644 --- a/samples/python/text2image/main.py +++ b/samples/python/image_generation/text2image.py @@ -42,4 +42,4 @@ def main(): if '__main__' == __name__: - main() + main() \ No newline at end of file diff --git a/samples/python/text2image/README.md b/samples/python/text2image/README.md deleted file mode 100644 index 2e841673d3..0000000000 --- a/samples/python/text2image/README.md +++ /dev/null @@ -1,98 +0,0 @@ -# Text to Image Python Generation Pipeline - -Examples in this folder showcase inference of text to image models like Stable Diffusion 1.5, 2.1, LCM. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.Text2ImagePipeline` and uses a text prompt as input source. 
- -There are three sample files: - - [`main.py`](./main.py) demonstrates basic usage of the text to image pipeline - - [`lora.py`](./lora.py) shows how to apply LoRA adapters to the pipeline - - [`heterogeneous_stable_diffusion.py`](./heterogeneous_stable_diffusion.py) shows how to assemble a heterogeneous txt2image pipeline from individual subcomponents (scheduler, text encoder, unet, vae decoder) - -Users can change the sample code and play with the following generation parameters: - -- Change width or height of generated image -- Generate multiple images per prompt -- Adjust a number of inference steps -- Play with [guidance scale](https://huggingface.co/spaces/stabilityai/stable-diffusion/discussions/9) (read [more details](https://arxiv.org/abs/2207.12598)) -- (SD 1.x, 2.x only) Add negative prompt when guidance scale > 1 -- Apply multiple different LoRA adapters and mix them with different blending coefficients - -## Download and convert the models and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16 -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python main.py ./dreamlike_anime_1_0_ov/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"` - -### Examples - -Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` - - ![](./image.bmp) - -## Run with callback - -You can also add a callback to the `main.py` file to interrupt the image generation process earlier if you are satisfied with the intermediate result of the image generation or to add logs. - -Please find the template of the callback usage below. - -```python -pipe = openvino_genai.Text2ImagePipeline(model_dir, device) - -def callback(step, intermediate_res): - print("Image generation step: ", step) - image_tensor = pipe.decode(intermediate_res) # get intermediate image tensor - if your_condition: # return True if you want to interrupt image generation - return True - return False - -image = pipe.generate( - ... - callback = callback -) -``` - -## Run with optional LoRA adapters - -LoRA adapters can be connected to the pipeline and modify generated images to have certain style, details or quality. Adapters are supported in Safetensors format and can be downloaded from public sources like [Civitai](https://civitai.com) or [HuggingFace](https://huggingface.co/models) or trained by the user. Adapters compatible with a base model should be used only. A weighted blend of multiple adapters can be applied by specifying multiple adapter files with corresponding alpha parameters in command line. Check `lora.cpp` source code to learn how to enable adapters and specify them in each `generate` call. - -Here is an example how to run the sample with a single adapter. First download adapter file from https://civitai.com/models/67927/soulcard page manually and save it as `soulcard.safetensors`. 
Or download it from command line: - -`wget -O soulcard.safetensors https://civitai.com/api/download/models/72591` - -Then run `lora.py`: - -`python lora.py ./dreamlike_anime_1_0_ov/FP16 "curly-haired unicorn in the forest, anime, line" soulcard.safetensors 0.7` - -The sample generates two images with and without adapters applied using the same prompt: - - `lora.bmp` with adapters applied - - `baseline.bmp` without adapters applied - -Check the difference: - -With adapter | Without adapter -:---:|:---: -![](./lora.bmp) | ![](./baseline.bmp) - -## Run with multiple devices - -The `heterogeneous_stable_diffusion.py` sample demonstrates how a Text2ImagePipeline object can be created from individual subcomponents - scheduler, text encoder, unet, & vae decoder. This approach gives fine-grained control over the devices used to execute each stage of the stable diffusion pipeline. - -The usage of this sample is: - -`heterogeneous_stable_diffusion.py [-h] model_dir prompt [text_encoder_device] [unet_device] [vae_decoder_device]` - -For example: - -`heterogeneous_stable_diffusion.py ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting' CPU NPU GPU` - -The sample will create a stable diffusion pipeline such that the text encoder is executed on the CPU, UNet on the NPU, and VAE decoder on the GPU. diff --git a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp index 347925727a..d48661d899 100644 --- a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp +++ b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp @@ -136,7 +136,6 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL { size_t get_vae_scale_factor() const; private: - void merge_vae_image_pre_processing() const; void merge_vae_image_post_processing() const; Config m_config; diff --git a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp index e798651580..0b749ecd83 100644 --- a/src/cpp/include/openvino/genai/image_generation/generation_config.hpp +++ b/src/cpp/include/openvino/genai/image_generation/generation_config.hpp @@ -20,58 +20,103 @@ namespace genai { // Random generators // +/** + * Base class to represent random generator used in Image generation pipelines + */ class OPENVINO_GENAI_EXPORTS Generator { public: + /** + * The function to return next random floating point value + * @returns Floating point value within a [0, 1] range + */ virtual float next() = 0; + + /** + * Generates a random tensor of floating point values with a given shape + * By default, it creates a tensor and fills it using 'Generator::next()' method element by element, + * but some random generator strategies have different pocilies how tensors are generated and this method + * provides an ability to change it. + */ virtual ov::Tensor randn_tensor(const ov::Shape& shape); + + /** + * Default dtor defined to ensure working RTTI. 
+ */ virtual ~Generator(); };
+/** + * Implementation of 'Generator' using standard C++ random library types 'std::mt19937' and 'std::normal_distribution' + */ class OPENVINO_GENAI_EXPORTS CppStdGenerator : public Generator { public: - // creates 'std::mt19937' with initial 'seed' to generate numbers within a range [0.0f, 1.0f] + /** + * Initializes C++ STD generator with a given seed + * @param seed A seed value + */ explicit CppStdGenerator(uint32_t seed); virtual float next() override; + private: std::mt19937 gen; std::normal_distribution normal; };
+/** + * Generation config used for Image generation pipelines. + * Note that not all values are applicable to all pipelines and models - please refer + * to the documentation of properties below to understand their meaning and applicability for specific models. + */ struct OPENVINO_GENAI_EXPORTS ImageGenerationConfig { - // LCM: prompt only w/o negative prompt - // SD XL: prompt2 and negative_prompt2 - // FLUX: prompt2 (prompt if prompt2 is not defined explicitly) - // SD 3: prompt2, prompt3 (with fallback to prompt) and negative_prompt2, negative_prompt3 + /** + * Prompts and negative prompts + */ std::optional prompt_2 = std::nullopt, prompt_3 = std::nullopt; std::optional negative_prompt = std::nullopt, negative_prompt_2 = std::nullopt, negative_prompt_3 = std::nullopt; + /** + * A number of images to generate per 'generate()' call + */ size_t num_images_per_prompt = 1; - // random generator to have deterministic results + /** + * Random generator used to initialize latents and to add noise to initial images in case of image to image / inpainting pipelines + */ std::shared_ptr generator = std::make_shared(42); - // the following values depend on HF diffusers class used to perform generation float guidance_scale = 7.5f; int64_t height = -1; int64_t width = -1; size_t num_inference_steps = 50; - // the following value used by t5_encoder_model (Flux, SD3 pipelines) + /** + * Max sequence length for T5 encoder / tokenizer used in SD3 / FLUX models + */ int max_sequence_length = -1; - // used by some image to image pipelines to balance between noise and initial image - // higher 'stregth' value means more noise is added to initial latent image - // for text to image pipeline it must be set to 1.0f + /** + * Strength parameter used in image to image / inpainting pipelines. + * Must be 1.0 for text to image generation as no initial image is provided in such scenario. + */ float strength = 1.0f; + /** + * Holds LoRA adapters + */ std::optional adapters; - void update_generation_config(const ov::AnyMap& config_map); - - // checks whether is config is valid + /** + * Checks whether image generation config is valid, otherwise throws an exception. + */ void validate() const; + /** + * Updates generation config from a map of properties. + * @param properties A map of properties + */ void update_generation_config(const ov::AnyMap& properties); + template ov::util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { return update_generation_config(ov::AnyMap{std::forward(properties)...}); @@ -82,27 +127,105 @@ struct OPENVINO_GENAI_EXPORTS ImageGenerationConfig { // Generation config properties //
+/** + * Prompt 2 for models which have at least two text encoders. Currently, it's used for SDXL, SD3, FLUX + */ static constexpr ov::Property prompt_2{"prompt_2"}; + +/** + * Prompt 3 for models which have three text encoders.
Currently, it's used only for SD3 + */ static constexpr ov::Property prompt_3{"prompt_3"};
+/** + * Negative prompt for models which support a negative prompt. Currently, it's used for SD, SDXL, SD3 + */ static constexpr ov::Property negative_prompt{"negative_prompt"}; + +/** + * Negative prompt 2 for models which have at least two text encoders. Currently, it's used for SDXL, SD3 + */ static constexpr ov::Property negative_prompt_2{"negative_prompt_2"}; + +/** + * Negative prompt 3 for models which have three text encoders. Currently, it's used only for SD3 + */ static constexpr ov::Property negative_prompt_3{"negative_prompt_3"};
+/** + * A number of images to generate per generate() call. If you want to generate multiple images + * for the same combination of generation parameters and text prompts, you can use this parameter + * for better performance as internally computations will be performed with batch for Unet / Transformer models + * and text embeddings tensors will also be computed only once. + */ static constexpr ov::Property num_images_per_prompt{"num_images_per_prompt"}; + +/** + * Guidance scale parameter which controls how closely the model sticks to the text embeddings generated + * by text encoders within a pipeline. A higher guidance scale value moves image generation towards the + * text embeddings, but the resulting image will be less natural and more augmented. + */ static constexpr ov::Property guidance_scale{"guidance_scale"}; + +/** + * Specifies a height of a resulting image. Typically, image height must be divisible by VAE scale factor + * (which is 8 in most cases) which represents ratio between latent image / RGB image sizes. + */ static constexpr ov::Property height{"height"}; + +/** + * Specifies a width of a resulting image. Typically, image width must be divisible by VAE scale factor + * (which is 8 in most cases) which represents ratio between latent image / RGB image sizes. + */ static constexpr ov::Property width{"width"}; + +/** + * Defines a number of inference steps used to denoise initial noised latent to final image. + * Note that in case of image to image / inpainting pipelines, the resulting number of inference steps + * is scaled with 'strength' parameter. + */ static constexpr ov::Property num_inference_steps{"num_inference_steps"};
+/** + * Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + * starting point and more noise is added the higher the `strength`. The number of denoising steps depends + * on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + * process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + * essentially ignores `image`. + */ static constexpr ov::Property strength{"strength"};
+/** + * Overrides default random generator used within image generation pipelines. + * By default, 'CppStdGenerator' is used, but if you are running Image generation via + * Python code, you can additionally install 'torch' and use OpenVINO GenAI's 'TorchGenerator' + * which ensures the generated images will look as in HuggingFace when the same seed value is used. + */ static constexpr ov::Property> generator{"generator"};
+/** + * This parameter limits the max sequence length for the T5 encoder for SD3 and FLUX models. + * T5 tokenizer output is padded with pad tokens to 'max_sequence_length' within a pipeline.
+ * So, for better performance, you can specify this parameter to lower value to speed-up + * T5 encoder inference as well as inference of transformer denoising model. + * For optimal performance it can be set to a number of tokens for 'prompt_3' / 'negative_prompt_3' for SD3 + * or `prompt_2` for FLUX. + * Note, that images generated with different values of 'max_sequence_length' are slightly different, but quite close. + */ static constexpr ov::Property max_sequence_length{"max_sequence_length"}; +/** + * User callback for image generation pipelines, which is called within a pipeline with the following arguments: + * - Total number of inference steps. Note, that in case of 'strength' parameter, the number of inference steps is reduced linearly + * - Current inference step + * - Tensor representing current latent. Such latent can be converted to human-readable representation via image generation pipeline 'decode()' method + */ static constexpr ov::Property> callback{"callback"}; +/** + * Function to pass 'ImageGenerationConfig' as property to 'generate()' call. + * @param generation_config An image generation config to convert to property-like format + */ OPENVINO_GENAI_EXPORTS std::pair generation_config(const ImageGenerationConfig& generation_config); diff --git a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp new file mode 100644 index 0000000000..a859b29c2e --- /dev/null +++ b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp @@ -0,0 +1,105 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +#include "openvino/core/any.hpp" +#include "openvino/runtime/tensor.hpp" + +#include "openvino/genai/image_generation/scheduler.hpp" +#include "openvino/genai/image_generation/generation_config.hpp" + +#include "openvino/genai/image_generation/clip_text_model.hpp" +#include "openvino/genai/image_generation/clip_text_model_with_projection.hpp" +#include "openvino/genai/image_generation/unet2d_condition_model.hpp" +#include "openvino/genai/image_generation/autoencoder_kl.hpp" + +namespace ov { +namespace genai { + +// forward declaration +class DiffusionPipeline; + +// +// Image to image pipeline +// + +class OPENVINO_GENAI_EXPORTS Image2ImagePipeline { +public: + explicit Image2ImagePipeline(const std::filesystem::path& models_path); + + Image2ImagePipeline(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties = {}); + + template ::value, bool>::type = true> + Image2ImagePipeline(const std::filesystem::path& models_path, + const std::string& device, + Properties&&... 
properties) + : Image2ImagePipeline(models_path, device, ov::AnyMap{std::forward(properties)...}) { } + + // creates either LCM or SD pipeline from building blocks + static Image2ImagePipeline stable_diffusion( + const std::shared_ptr& scheduler_type, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae); + + // creates either LCM or SD pipeline from building blocks + static Image2ImagePipeline latent_consistency_model( + const std::shared_ptr& scheduler_type, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae); + + // creates SDXL pipeline from building blocks + static Image2ImagePipeline stable_diffusion_xl( + const std::shared_ptr& scheduler_type, + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae); + + ImageGenerationConfig get_generation_config() const; + void set_generation_config(const ImageGenerationConfig& generation_config); + + // ability to override scheduler + void set_scheduler(std::shared_ptr scheduler); + + // with static shapes performance is better + void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale); + + void compile(const std::string& device, const ov::AnyMap& properties = {}); + + template + ov::util::EnableIfAllStringAny compile( + const std::string& device, + Properties&&... properties) { + return compile(device, ov::AnyMap{std::forward(properties)...}); + } + + // Returns a tensor with the following dimensions [num_images_per_prompt, height, width, 3] + ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, const ov::AnyMap& properties = {}); + + template + ov::util::EnableIfAllStringAny generate( + const std::string& positive_prompt, + ov::Tensor initial_image, + Properties&&... 
properties) { + return generate(positive_prompt, initial_image, ov::AnyMap{std::forward(properties)...}); + } + + ov::Tensor decode(const ov::Tensor latent); + +private: + std::shared_ptr m_impl; + + explicit Image2ImagePipeline(const std::shared_ptr& impl); +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp new file mode 100644 index 0000000000..c970fa0e23 --- /dev/null +++ b/src/cpp/include/openvino/genai/image_generation/inpainting_pipeline.hpp @@ -0,0 +1,106 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +#include "openvino/core/any.hpp" +#include "openvino/runtime/tensor.hpp" + +#include "openvino/genai/image_generation/scheduler.hpp" +#include "openvino/genai/image_generation/generation_config.hpp" + +#include "openvino/genai/image_generation/clip_text_model.hpp" +#include "openvino/genai/image_generation/clip_text_model_with_projection.hpp" +#include "openvino/genai/image_generation/unet2d_condition_model.hpp" +#include "openvino/genai/image_generation/autoencoder_kl.hpp" + +namespace ov { +namespace genai { + +// forward declaration +class DiffusionPipeline; + +// +// Inpainting pipeline +// + +class OPENVINO_GENAI_EXPORTS InpaintingPipeline { +public: + explicit InpaintingPipeline(const std::filesystem::path& models_path); + + InpaintingPipeline(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties = {}); + + template ::value, bool>::type = true> + InpaintingPipeline(const std::filesystem::path& models_path, + const std::string& device, + Properties&&... properties) + : InpaintingPipeline(models_path, device, ov::AnyMap{std::forward(properties)...}) { } + + // creates either LCM or SD pipeline from building blocks + static InpaintingPipeline stable_diffusion( + const std::shared_ptr& scheduler_type, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae); + + // creates either LCM or SD pipeline from building blocks + static InpaintingPipeline latent_consistency_model( + const std::shared_ptr& scheduler_type, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae); + + // creates SDXL pipeline from building blocks + static InpaintingPipeline stable_diffusion_xl( + const std::shared_ptr& scheduler_type, + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae); + + ImageGenerationConfig get_generation_config() const; + void set_generation_config(const ImageGenerationConfig& generation_config); + + // ability to override scheduler + void set_scheduler(std::shared_ptr scheduler); + + // with static shapes performance is better + void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale); + + void compile(const std::string& device, const ov::AnyMap& properties = {}); + + template + ov::util::EnableIfAllStringAny compile( + const std::string& device, + Properties&&... 
properties) { + return compile(device, ov::AnyMap{std::forward(properties)...}); + } + + // Returns a tensor with the following dimensions [num_images_per_prompt, height, width, 3] + ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask_image, const ov::AnyMap& properties = {}); + + template + ov::util::EnableIfAllStringAny generate( + const std::string& positive_prompt, + ov::Tensor initial_image, + ov::Tensor mask, + Properties&&... properties) { + return generate(positive_prompt, initial_image, mask, ov::AnyMap{std::forward(properties)...}); + } + + ov::Tensor decode(const ov::Tensor latent); + +private: + std::shared_ptr m_impl; + + explicit InpaintingPipeline(const std::shared_ptr& impl); +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp index 4fce33856f..b66ced748b 100644 --- a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp @@ -28,14 +28,32 @@ namespace genai { // forward declaration class DiffusionPipeline; -// -// Text to image pipeline -// - +/** + * Text to image pipelines which provides unified API to all supported models types. + * Models specific aspects are hidden in image generation config, which includes multiple prompts support or + * other specific parameters like max_sequence_length + */ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { public: + /** + * Initializes text to image generation pipeline from a folder with models. + * Note, such pipeline is not ready to use as models are not compiled internally. + * + * Typical scenario is to initialize models using this constructor and then reshape pipeline + * with 'reshape()' method and then perform compilation using 'compile()' method. + * @param models_path A models path to read models and config files from + */ explicit Text2ImagePipeline(const std::filesystem::path& models_path); + /** + * Initializes text to image pipelines from a folder with models and performs compilation after it + * @param models_path A models path to read models and config files from + * @param device A single device used for all models + * @param properties Properties to pass to 'compile_model' or other pipeline properties like LoRA adapters + * @note If you want to compile each model on a dedicated device or with specific properties, you can create + * models individually and then combine a final pipeline using static methods like 'latent_consistency_model' or + * 'stable_diffusion_3'. 
See 'samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp' for example + */ Text2ImagePipeline(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties = {}); template (properties)...}) { } - // creates either LCM or SD pipeline from building blocks + /** + * Creates Stable Diffusion pipeline from individual models + * @param scheduler A scheduler used to denoise final image + * @param clip_text_model A CLIP text encoder model + * @param unet An Unet model + * @param vae VAE auto encoder model + */ static Text2ImagePipeline stable_diffusion( const std::shared_ptr& scheduler, const CLIPTextModel& clip_text_model, const UNet2DConditionModel& unet, const AutoencoderKL& vae); - // creates either LCM or SD pipeline from building blocks + /** + * Creates Latent Consistency Model pipeline from individual models + * @param scheduler A scheduler used to denoise final image + * @param clip_text_model A CLIP text encoder model + * @param unet An Unet denoising model + * @param vae VAE auto encoder model + */ static Text2ImagePipeline latent_consistency_model( const std::shared_ptr& scheduler, const CLIPTextModel& clip_text_model, const UNet2DConditionModel& unet, const AutoencoderKL& vae); - // creates SDXL pipeline from building blocks + /** + * Creates Stable Diffusion XL pipeline from individual models + * @param scheduler A scheduler used to denoise final image + * @param clip_text_model A CLIP text encoder model + * @param clip_text_model_with_projection A CLIP text encoder with projection model + * @param unet An Unet denoising model + * @param vae VAE auto encoder model + */ static Text2ImagePipeline stable_diffusion_xl( const std::shared_ptr& scheduler, const CLIPTextModel& clip_text_model, @@ -67,7 +104,15 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { const UNet2DConditionModel& unet, const AutoencoderKL& vae); - // creates SD3 pipeline from building blocks + /** + * Creates Stable Diffusion 3 pipeline from individual models with T5 text encoder + * @param scheduler A scheduler used to denoise final image + * @param clip_text_model_1 A first CLIP text encoder model + * @param clip_text_model_1 A second CLIP text encoder model + * @param t5_encoder_model A T5 text encoder model. 
+ * @param transformer A Transformer denoising model + * @param vae VAE auto encoder model + */ static Text2ImagePipeline stable_diffusion_3( const std::shared_ptr& scheduler, const CLIPTextModelWithProjection& clip_text_model_1, @@ -76,7 +121,29 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { const SD3Transformer2DModel& transformer, const AutoencoderKL& vae); - // creates Flux pipeline from building blocks + /** + * Creates Stable Diffusion 3 pipeline from individual models without T5 text encoder + * @param scheduler A scheduler used to denoise final image + * @param clip_text_model_1 A first CLIP text encoder model + * @param clip_text_model_1 A second CLIP text encoder model + * @param transformer A Transformer denoising model + * @param vae VAE auto encoder model + */ + static Text2ImagePipeline stable_diffusion_3( + const std::shared_ptr& scheduler, + const CLIPTextModelWithProjection& clip_text_model_1, + const CLIPTextModelWithProjection& clip_text_model_2, + const SD3Transformer2DModel& transformer, + const AutoencoderKL& vae); + + /** + * Creates FLUX pipeline from individual models + * @param scheduler A scheduler used to denoise final image + * @param clip_text_model A CLIP text encoder model + * @param t5_encoder_model A T5 text encoder model + * @param transformer A Transformer denoising model + * @param vae VAE auto encoder model + */ static Text2ImagePipeline flux( const std::shared_ptr& scheduler_type, const CLIPTextModel& clip_text_model, @@ -84,15 +151,42 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { const FluxTransformer2DModel& transformer, const AutoencoderKL& vae); + /** + * Returns default image generation config created internally based on model type. + * @returns Image generation config + */ ImageGenerationConfig get_generation_config() const; + + /** + * Sets image generation config + * @returns An image generation config + */ void set_generation_config(const ImageGenerationConfig& generation_config); - // ability to override scheduler + /** + * Overrides default scheduler used to denoise initial latent + * @param scheduler A scheduler to set to a pipeline + */ void set_scheduler(std::shared_ptr scheduler); - // with static shapes performance is better + /** + * Reshapes pipeline based on a given set of reshape parameters, which affect shapes of models within pipeline + * @note Reshaping can be useful to get maximum performance, but limit image generation to specific output sizes + * @param num_images_per_prompt A number of image to generate per 'generate()' call + * @param height A height of resulting image + * @param width A width of resulting image + * @param guidance_scale A guidance scale. Note, that it's important whether guidance_scale > 1, which affects whether negative prompts + * are used or not. For example, all values > 1 are the same for reshape perspective and may vary in subsequent 'generate()' calls. + * @note If pipeline has been already compiled, it cannot be reshaped and an exception is thrown. + */ void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale); + /** + * Compiles image generation pipeline for a given device + * @param device A device to compile models with + * @param properties A map of properties which affect models compilation + * @note If pipeline was compiled before, an exception is thrown. 
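+ * @note A typical flow is: create the pipeline, optionally 'reshape()' it to static shapes, then 'compile()' it, e.g. (a sketch; 'models_path' is a placeholder for your exported model folder):
+ *     ov::genai::Text2ImagePipeline pipe(models_path);                            // read models
+ *     pipe.reshape(1, 512, 512, pipe.get_generation_config().guidance_scale);     // optional: fix shapes for better performance
+ *     pipe.compile("GPU");                                                        // compile for the target device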
+ */ void compile(const std::string& device, const ov::AnyMap& properties = {}); template @@ -102,7 +196,12 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { return compile(device, ov::AnyMap{std::forward(properties)...}); } - // Returns a tensor with the following dimensions [num_images_per_prompt, height, width, 3] + /** + * Generates image(s) based on prompt and other image generarion parameters + * @param positive_prompt Prompt to generate image(s) from + * @param properties Image generation parameters specified as properties. Values in 'properties' override default value for generation parameters. + * @return A tensor which has dimensions [num_images_per_prompt, height, width, 3] + */ ov::Tensor generate(const std::string& positive_prompt, const ov::AnyMap& properties = {}); template @@ -112,6 +211,11 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { return generate(positive_prompt, ov::AnyMap{std::forward(properties)...}); } + /** + * Performs latent image decoding. It can be useful to use within 'callback' which accepts current latent image + * @param latent A latent image + * @returns An image decoding with VAE auto encoder + */ ov::Tensor decode(const ov::Tensor latent); private: diff --git a/src/cpp/src/image_generation/diffusion_pipeline.hpp b/src/cpp/src/image_generation/diffusion_pipeline.hpp index 42e35101cc..86d8ba9009 100644 --- a/src/cpp/src/image_generation/diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/diffusion_pipeline.hpp @@ -4,6 +4,7 @@ #pragma once #include +#include #include "image_generation/schedulers/ischeduler.hpp" #include "openvino/genai/image_generation/generation_config.hpp" @@ -51,7 +52,8 @@ namespace genai { enum class PipelineType { TEXT_2_IMAGE = 0, - IMAGE_2_IMAGE = 1 + IMAGE_2_IMAGE = 1, + INPAINTING = 2, }; class DiffusionPipeline { @@ -78,11 +80,13 @@ class DiffusionPipeline { virtual void compile(const std::string& device, const ov::AnyMap& properties) = 0; - virtual ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const = 0; + virtual std::tuple prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const = 0; virtual void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) = 0; - virtual ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, const ov::AnyMap& properties) = 0; + virtual void set_lora_adapters(std::optional adapters) = 0; + + virtual ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask_image, const ov::AnyMap& properties) = 0; virtual ov::Tensor decode(const ov::Tensor latent) = 0; @@ -95,6 +99,39 @@ class DiffusionPipeline { virtual void check_inputs(const ImageGenerationConfig& generation_config, ov::Tensor initial_image) const = 0; + void blend_latents(ov::Tensor image_latent, ov::Tensor noise, ov::Tensor mask, ov::Tensor latent, size_t inference_step) { + OPENVINO_ASSERT(m_pipeline_type == PipelineType::INPAINTING, "'prepare_mask_latents' can be called for inpainting pipeline only"); + OPENVINO_ASSERT(image_latent.get_shape() == latent.get_shape(), "Shapes for current", latent.get_shape(), "and initial image latents ", image_latent.get_shape(), " must match"); + + ov::Tensor noised_image_latent(image_latent.get_element_type(), {}); + std::vector timesteps = m_scheduler->get_timesteps(); + + if (inference_step < timesteps.size() - 1) { + image_latent.copy_to(noised_image_latent); + + int64_t 
noise_timestep = timesteps[inference_step + 1]; + m_scheduler->add_noise(noised_image_latent, noise, noise_timestep); + } else { + noised_image_latent = image_latent; + } + + ov::Shape shape = image_latent.get_shape(); + size_t batch_size = shape[0], in_channels = shape[1], channel_size = shape[2] * shape[3]; + OPENVINO_ASSERT(batch_size == 1, "Batch size 1 is supported for now"); + + const float * mask_data = mask.data(); + const float * noised_image_latent_data = noised_image_latent.data(); + float * latent_data = latent.data(); + + // blend initial noised and processed latents + for (size_t i = 0; i < channel_size; ++i) { + float mask_value = mask_data[i]; + for (size_t j = 0; j < in_channels; ++j) { + latent_data[j * channel_size + i] = (1.0f - mask_value) * noised_image_latent_data[j * channel_size + i] + mask_value * latent_data[j * channel_size + i]; + } + } + } + PipelineType m_pipeline_type; std::shared_ptr m_scheduler; ImageGenerationConfig m_generation_config; diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index 8a9d6aba5c..4cdac5bb1a 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -1,6 +1,8 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include #include @@ -136,7 +138,13 @@ class FluxPipeline : public DiffusionPipeline { const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { - m_vae = std::make_shared(root_dir / "vae_decoder"); + if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) + m_vae = std::make_shared(root_dir / "vae_decoder"); + else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { + m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder"); + } else { + OPENVINO_ASSERT("Unsupported pipeline type"); + } } else { OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); } @@ -145,7 +153,7 @@ class FluxPipeline : public DiffusionPipeline { if (transformer == "FluxTransformer2DModel") { m_transformer = std::make_shared(root_dir / "transformer"); } else { - OPENVINO_THROW("Unsupported '", transformer, "'Transformer type"); + OPENVINO_THROW("Unsupported '", transformer, "' Transformer type"); } // initialize generation config @@ -182,7 +190,13 @@ class FluxPipeline : public DiffusionPipeline { const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { - m_vae = std::make_shared(root_dir / "vae_decoder", device, properties); + if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) + m_vae = std::make_shared(root_dir / "vae_decoder", device, properties); + else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { + m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder", device, properties); + } else { + OPENVINO_ASSERT("Unsupported pipeline type"); + } } else { OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); } @@ -191,7 +205,7 @@ class FluxPipeline : public DiffusionPipeline { if (transformer == "FluxTransformer2DModel") { m_transformer = std::make_shared(root_dir / "transformer", device, properties); } else { - OPENVINO_THROW("Unsupported '", transformer, "'Transformer type"); + OPENVINO_THROW("Unsupported '", transformer, "' Transformer type"); } // initialize generation config @@ -240,27 +254,19 @@ class FluxPipeline : public DiffusionPipeline { void compute_hidden_states(const std::string& positive_prompt, 
const ImageGenerationConfig& generation_config) override { // encode_prompt - std::string prompt_2_str = - generation_config.prompt_2 != std::nullopt ? *generation_config.prompt_2 : positive_prompt; + std::string prompt_2_str = generation_config.prompt_2 != std::nullopt ? *generation_config.prompt_2 : positive_prompt; - m_clip_text_encoder->infer(positive_prompt, "", false); - ov::Tensor pooled_prompt_embeds_out = m_clip_text_encoder->get_output_tensor(1); + m_clip_text_encoder->infer(positive_prompt, {}, false); + ov::Tensor pooled_prompt_embeds = m_clip_text_encoder->get_output_tensor(1); + ov::Tensor prompt_embeds = m_t5_text_encoder->infer(prompt_2_str, "", false, generation_config.max_sequence_length); - ov::Tensor prompt_embeds_out = m_t5_text_encoder->infer(prompt_2_str, "", false, generation_config.max_sequence_length); - - ov::Tensor pooled_prompt_embeds, prompt_embeds; - if (generation_config.num_images_per_prompt == 1) { - pooled_prompt_embeds = pooled_prompt_embeds_out; - prompt_embeds = prompt_embeds_out; - } else { - pooled_prompt_embeds = numpy_utils::repeat(pooled_prompt_embeds_out, generation_config.num_images_per_prompt); - prompt_embeds = numpy_utils::repeat(prompt_embeds_out, generation_config.num_images_per_prompt); - } + pooled_prompt_embeds = numpy_utils::repeat(pooled_prompt_embeds, generation_config.num_images_per_prompt); + prompt_embeds = numpy_utils::repeat(prompt_embeds, generation_config.num_images_per_prompt); // text_ids = torch.zeros(prompt_embeds.shape[1], 3) ov::Shape text_ids_shape = {prompt_embeds.get_shape()[1], 3}; ov::Tensor text_ids(ov::element::f32, text_ids_shape); - std::fill_n(text_ids.data(), text_ids_shape[0] * text_ids_shape[1], 0.0f); + std::fill_n(text_ids.data(), text_ids.get_size(), 0.0f); const size_t num_channels_latents = m_transformer->get_config().in_channels / 4; const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); @@ -281,8 +287,7 @@ class FluxPipeline : public DiffusionPipeline { m_transformer->set_hidden_states("img_ids", latent_image_ids); } - ov::Tensor prepare_latents(ov::Tensor initial_image, - const ImageGenerationConfig& generation_config) const override { + std::tuple prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); size_t num_channels_latents = m_transformer->get_config().in_channels / 4; @@ -293,15 +298,25 @@ class FluxPipeline : public DiffusionPipeline { num_channels_latents, height, width}; + ov::Tensor latent(ov::element::f32, {}), proccesed_image, image_latent, noise; - ov::Tensor latents_input = generation_config.generator->randn_tensor(latent_shape); - ov::Tensor latents = pack_latents(latents_input, generation_config.num_images_per_prompt, num_channels_latents, height, width); + if (initial_image) { + OPENVINO_THROW("StableDiffusion3 image to image is not implemented"); + } else { + noise = generation_config.generator->randn_tensor(latent_shape); + latent = pack_latents(noise, generation_config.num_images_per_prompt, num_channels_latents, height, width); + } - return latents; + return std::make_tuple(latent, proccesed_image, image_latent, noise); + } + + void set_lora_adapters(std::optional adapters) override { + OPENVINO_THROW("LORA adapters are not implemented for FLUX pipeline yet"); } ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, + ov::Tensor mask_image, const ov::AnyMap& properties) override { m_custom_generation_config = m_generation_config; 
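+        // 'properties' passed to this 'generate()' call override a copy of the pipeline's default generation config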
m_custom_generation_config.update_generation_config(properties); @@ -311,6 +326,11 @@ class FluxPipeline : public DiffusionPipeline { m_custom_generation_config.strength = 1.0f; } + if (!initial_image) { + // in case of typical text to image generation, we need to ignore 'strength' + m_custom_generation_config.strength = 1.0f; + } + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); const auto& transformer_config = m_transformer->get_config(); @@ -323,7 +343,8 @@ class FluxPipeline : public DiffusionPipeline { compute_hidden_states(positive_prompt, m_custom_generation_config); - ov::Tensor latents = prepare_latents(initial_image, m_custom_generation_config); + ov::Tensor latents, processed_image, image_latent, noise; + std::tie(latents, processed_image, image_latent, noise) = prepare_latents(initial_image, m_custom_generation_config); size_t image_seq_len = latents.get_shape()[1]; float mu = m_scheduler->calculate_shift(image_seq_len); @@ -384,9 +405,16 @@ class FluxPipeline : public DiffusionPipeline { m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor; m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor; - if (class_name == "FluxPipeline") { - m_generation_config.guidance_scale = 3.5f; - m_generation_config.num_inference_steps = 28; + if (class_name == "FluxPipeline" || class_name == "FluxImg2ImgPipeline" || class_name == "FluxInpaintPipeline" ) { + if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) { + m_generation_config.guidance_scale = 3.5f; + m_generation_config.num_inference_steps = 28; + m_generation_config.strength = 1.0f; + } else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { + m_generation_config.guidance_scale = 3.5f; + m_generation_config.num_inference_steps = 28; + m_generation_config.strength = 1.0f; + } m_generation_config.max_sequence_length = 512; } else { OPENVINO_THROW("Unsupported class_name '", class_name, "'. 
Please, contact OpenVINO GenAI developers"); @@ -398,7 +426,7 @@ class FluxPipeline : public DiffusionPipeline { // const size_t vae_scale_factor = m_transformer->get_vae_scale_factor(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) && (width % vae_scale_factor == 0 || width < 0), - "Both 'width' and 'height' must be divisible by", + "Both 'width' and 'height' must be divisible by ", vae_scale_factor); } @@ -411,6 +439,22 @@ class FluxPipeline : public DiffusionPipeline { OPENVINO_ASSERT(generation_config.negative_prompt_2 == std::nullopt, "Negative prompt 2 is not used by FluxPipeline"); OPENVINO_ASSERT(generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used by FluxPipeline"); OPENVINO_ASSERT(generation_config.prompt_3 == std::nullopt, "Prompt 3 is not used by FluxPipeline"); + + if ((m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) && initial_image) { + ov::Shape initial_image_shape = initial_image.get_shape(); + size_t height = initial_image_shape[1], width = initial_image_shape[2]; + + OPENVINO_ASSERT(generation_config.height == height, + "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same"); + OPENVINO_ASSERT(generation_config.width == width, + "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same"); + + OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f, + "'Strength' generation parameter must be withion [0, 1] range"); + } else { + OPENVINO_ASSERT(generation_config.strength == 1.0f, "'Strength' generation parameter must be 1.0f for Text 2 image pipeline"); + OPENVINO_ASSERT(!initial_image, "Internal error: initial_image must be empty for Text 2 image pipeline"); + } } std::shared_ptr m_transformer = nullptr; diff --git a/src/cpp/src/image_generation/image2image_pipeline.cpp b/src/cpp/src/image_generation/image2image_pipeline.cpp new file mode 100644 index 0000000000..527b532b71 --- /dev/null +++ b/src/cpp/src/image_generation/image2image_pipeline.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include + +#include "openvino/genai/image_generation/image2image_pipeline.hpp" + +#include "image_generation/stable_diffusion_pipeline.hpp" +#include "image_generation/stable_diffusion_xl_pipeline.hpp" + +#include "utils.hpp" + +namespace ov { +namespace genai { + +Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir) { + const std::string class_name = get_class_name(root_dir); + + if (class_name == "StableDiffusionPipeline" || class_name == "LatentConsistencyModelPipeline") { + m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir); + } else if (class_name == "StableDiffusionXLPipeline") { + m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir); + } else { + OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); + } +} + +Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) { + const std::string class_name = get_class_name(root_dir); + + if (class_name == "StableDiffusionPipeline" || class_name == "LatentConsistencyModelPipeline") { + m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties); + } else if (class_name == 
"StableDiffusionXLPipeline") { + m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties); + } else { + OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); + } +} + +Image2ImagePipeline::Image2ImagePipeline(const std::shared_ptr& impl) + : m_impl(impl) { + assert(m_impl != nullptr); +} + +Image2ImagePipeline Image2ImagePipeline::stable_diffusion( + const std::shared_ptr& scheduler, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae) { + auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, unet, vae); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return Image2ImagePipeline(impl); +} + +Image2ImagePipeline Image2ImagePipeline::latent_consistency_model( + const std::shared_ptr& scheduler, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae) { + auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, unet, vae); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return Image2ImagePipeline(impl); +} + +Image2ImagePipeline Image2ImagePipeline::stable_diffusion_xl( + const std::shared_ptr& scheduler, + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae) { + auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, clip_text_model_with_projection, unet, vae); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return Image2ImagePipeline(impl); +} + +ImageGenerationConfig Image2ImagePipeline::get_generation_config() const { + return m_impl->get_generation_config(); +} + +void Image2ImagePipeline::set_generation_config(const ImageGenerationConfig& generation_config) { + m_impl->set_generation_config(generation_config); +} + +void Image2ImagePipeline::set_scheduler(std::shared_ptr scheduler) { + m_impl->set_scheduler(scheduler); +} + +void Image2ImagePipeline::reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) { + m_impl->reshape(num_images_per_prompt, height, width, guidance_scale); +} + +void Image2ImagePipeline::compile(const std::string& device, const ov::AnyMap& properties) { + m_impl->compile(device, properties); +} + +ov::Tensor Image2ImagePipeline::generate(const std::string& positive_prompt, ov::Tensor initial_image, const ov::AnyMap& properties) { + OPENVINO_ASSERT(initial_image, "Initial image cannot be empty when passed to Image2ImagePipeline::generate"); + return m_impl->generate(positive_prompt, initial_image, {}, properties); +} + +ov::Tensor Image2ImagePipeline::decode(const ov::Tensor latent) { + return m_impl->decode(latent); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/image_generation/image_processor.cpp b/src/cpp/src/image_generation/image_processor.cpp new file mode 100644 index 0000000000..8c73ee2da0 --- /dev/null +++ b/src/cpp/src/image_generation/image_processor.cpp @@ -0,0 +1,169 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "image_generation/image_processor.hpp" + +#include + +#include "openvino/core/model.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/result.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/greater_eq.hpp" +#include "openvino/op/select.hpp" + +#include "utils.hpp" // for utils::singleton_core 
+ +namespace ov { +namespace genai { + +namespace { + +std::shared_ptr create_empty_model(ov::element::Type type = ov::element::f32) { + auto parameter = std::make_shared(type, ov::PartialShape::dynamic(4)); + auto result = std::make_shared(parameter); + return std::make_shared(ov::ResultVector{result}, ov::ParameterVector{parameter}); +} + +} // namespace + +IImageProcessor::IImageProcessor(const std::string& device) : + m_device(device) { +} + +ov::Tensor IImageProcessor::execute(ov::Tensor image) { + m_request.set_input_tensor(image); + m_request.infer(); + return m_request.get_output_tensor(); +} + +void IImageProcessor::compile(std::shared_ptr model) { + m_request = utils::singleton_core().compile_model(model, m_device).create_infer_request(); +} + +ImageProcessor::ImageProcessor(const std::string& device, bool do_normalize, bool do_binarize) : + IImageProcessor(device) { + auto image_processor_model = create_empty_model(); + merge_image_preprocessing(image_processor_model, do_normalize, do_binarize); + + compile(image_processor_model); +} + +void ImageProcessor::merge_image_preprocessing(std::shared_ptr model, bool do_normalize, bool do_binarize) { + OPENVINO_ASSERT(do_normalize ^ do_binarize, "Both binarize and normalize are not supported"); + + // https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L90-L110 + ov::preprocess::PrePostProcessor ppp(model); + + ppp.input().tensor() + .set_layout("NHWC") + .set_element_type(ov::element::u8) + .set_color_format(ov::preprocess::ColorFormat::BGR); + ppp.input().model() + .set_layout("NCHW"); + + if (do_normalize) { + ppp.input().tensor().set_layout("NHWC"); + ppp.input().model().set_layout("NCHW"); + + ppp.input().tensor() + .set_element_type(ov::element::u8); + + ppp.input().preprocess() + .convert_layout() + .convert_element_type(ov::element::f32) + // this is less accurate that in VaeImageProcessor::normalize + .scale(255.0 / 2.0) + .mean(1.0f); + } else if (do_binarize) { + ppp.input().preprocess() + .convert_element_type(ov::element::f32) + .convert_color(ov::preprocess::ColorFormat::GRAY) + .scale(255.0f) + .custom([](const ov::Output& port) { + auto constant_0_5 = std::make_shared(ov::element::f32, ov::Shape{1}, 0.5f); + auto constant_1_0 = std::make_shared(ov::element::f32, ov::Shape{1}, 1.0f); + auto constant_0_0 = std::make_shared(ov::element::f32, ov::Shape{1}, 0.0f); + auto mask_bool = std::make_shared(port, constant_0_5); + auto mask_float = std::make_shared(mask_bool, constant_1_0, constant_0_0); + return mask_float; + }); + } + + ppp.build(); +} + +ImageResizer::ImageResizer(const std::string& device, ov::element::Type type, ov::Layout layout, ov::op::v11::Interpolate::InterpolateMode interpolation_mode) { + auto image_parameter = std::make_shared(type, ov::PartialShape::dynamic(4)); + image_parameter->get_output_tensor(0).add_names({"image"}); + + auto target_spatial_shape = std::make_shared(element::i64, Shape{2}); + target_spatial_shape->get_output_tensor(0).add_names({"target_spatial_shape"}); + + ov::PartialShape pshape = ov::PartialShape::dynamic(4); + const auto height_idx = static_cast(get_and_check_height_idx(layout, pshape)); + const auto width_idx = static_cast(get_and_check_width_idx(layout, pshape)); + + // In future consider replacing this to set of new OV operations like `getDimByName(node, "H")` + // This is to allow specifying layout on 'evaluation' stage + const auto axes = op::v0::Constant::create(element::i64, Shape{2}, 
{height_idx, width_idx}); + + op::util::InterpolateBase::InterpolateAttrs attrs(interpolation_mode, + op::util::InterpolateBase::ShapeCalcMode::SIZES, + {0, 0}, + {0, 0}); + + attrs.coordinate_transformation_mode = op::util::InterpolateBase::CoordinateTransformMode::ASYMMETRIC; + attrs.nearest_mode = op::util::InterpolateBase::NearestMode::FLOOR; + if (attrs.mode != op::util::InterpolateBase::InterpolateMode::NEAREST) { + attrs.coordinate_transformation_mode = op::util::InterpolateBase::CoordinateTransformMode::PYTORCH_HALF_PIXEL; + } + + const auto interp = std::make_shared(image_parameter, target_spatial_shape, axes, attrs); + + auto result = std::make_shared(interp); + auto resize_model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{image_parameter, target_spatial_shape}); + + m_request = utils::singleton_core().compile_model(resize_model, device).create_infer_request(); +} + +ov::Tensor ImageResizer::execute(ov::Tensor image, int64_t dst_height, int64_t dst_width) { + ov::Tensor target_spatial_tensor(ov::element::i64, ov::Shape{2}); + target_spatial_tensor.data()[0] = dst_height; + target_spatial_tensor.data()[1] = dst_width; + + m_request.set_tensor("image", image); + m_request.set_tensor("target_spatial_shape", target_spatial_tensor); + m_request.infer(); + + return m_request.get_output_tensor(); +} + +size_t ImageResizer::get_and_check_width_idx(const Layout& layout, const PartialShape& shape) { + OPENVINO_ASSERT(ov::layout::has_width(layout), "Layout ", layout.to_string(), " doesn't have `width` dimension"); + OPENVINO_ASSERT(shape.rank().is_static(), "Can't get shape width index for shape with dynamic rank"); + auto idx = ov::layout::width_idx(layout); + if (idx < 0) { + idx = shape.rank().get_length() + idx; + } + OPENVINO_ASSERT(idx >= 0 && shape.rank().get_length() > idx, + "Width dimension is out of bounds ", + std::to_string(idx)); + return idx; +} + +size_t ImageResizer::get_and_check_height_idx(const Layout& layout, const PartialShape& shape) { + OPENVINO_ASSERT(ov::layout::has_height(layout), "Layout ", layout.to_string(), " doesn't have `height` dimension"); + OPENVINO_ASSERT(shape.rank().is_static(), "Can't get shape height index for shape with dynamic rank"); + auto idx = ov::layout::height_idx(layout); + if (idx < 0) { + idx = shape.rank().get_length() + idx; + } + OPENVINO_ASSERT(idx >= 0 && shape.rank().get_length() > idx, + "Height dimension is out of bounds ", + std::to_string(idx)); + return idx; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/image_generation/image_processor.hpp b/src/cpp/src/image_generation/image_processor.hpp new file mode 100644 index 0000000000..d0ef7532aa --- /dev/null +++ b/src/cpp/src/image_generation/image_processor.hpp @@ -0,0 +1,50 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/core/layout.hpp" +#include "openvino/runtime/infer_request.hpp" + +#include "openvino/op/interpolate.hpp" + +namespace ov { +namespace genai { + +class IImageProcessor { +public: + explicit IImageProcessor(const std::string& device); + + virtual ~IImageProcessor() = default; + + virtual ov::Tensor execute(ov::Tensor image); + +protected: + void compile(std::shared_ptr model); + + ov::InferRequest m_request; + std::string m_device; +}; + +class ImageProcessor : public IImageProcessor { +public: + explicit ImageProcessor(const std::string& device, bool do_normalize = true, bool do_binarize = false); + + static void 
merge_image_preprocessing(std::shared_ptr model, bool do_normalize = true, bool do_binarize = false); +}; + +class ImageResizer { +public: + ImageResizer(const std::string& device, ov::element::Type type, ov::Layout layout, ov::op::v11::Interpolate::InterpolateMode interpolation_mode); + + ov::Tensor execute(ov::Tensor image, int64_t dst_height, int64_t dst_width); + +private: + size_t get_and_check_width_idx(const Layout& layout, const PartialShape& shape); + size_t get_and_check_height_idx(const Layout& layout, const PartialShape& shape); + + ov::InferRequest m_request; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/image_generation/inpainting_pipeline.cpp b/src/cpp/src/image_generation/inpainting_pipeline.cpp new file mode 100644 index 0000000000..d3612c4964 --- /dev/null +++ b/src/cpp/src/image_generation/inpainting_pipeline.cpp @@ -0,0 +1,122 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include + +#include "openvino/genai/image_generation/inpainting_pipeline.hpp" + +#include "image_generation/stable_diffusion_pipeline.hpp" +#include "image_generation/stable_diffusion_xl_pipeline.hpp" + +#include "utils.hpp" + +namespace ov { +namespace genai { + +InpaintingPipeline::InpaintingPipeline(const std::filesystem::path& root_dir) { + const std::string class_name = get_class_name(root_dir); + + if (class_name == "StableDiffusionPipeline" || + class_name == "LatentConsistencyModelPipeline" || + class_name == "StableDiffusionInpaintPipeline") { + m_impl = std::make_shared(PipelineType::INPAINTING, root_dir); + } else if (class_name == "StableDiffusionXLPipeline" || class_name == "StableDiffusionXLInpaintPipeline") { + m_impl = std::make_shared(PipelineType::INPAINTING, root_dir); + } else { + OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); + } +} + +InpaintingPipeline::InpaintingPipeline(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) { + const std::string class_name = get_class_name(root_dir); + + if (class_name == "StableDiffusionPipeline" || + class_name == "LatentConsistencyModelPipeline" || + class_name == "StableDiffusionInpaintPipeline") { + m_impl = std::make_shared(PipelineType::INPAINTING, root_dir, device, properties); + } else if (class_name == "StableDiffusionXLPipeline" || class_name == "StableDiffusionXLInpaintPipeline") { + m_impl = std::make_shared(PipelineType::INPAINTING, root_dir, device, properties); + } else { + OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); + } +} + +InpaintingPipeline::InpaintingPipeline(const std::shared_ptr& impl) + : m_impl(impl) { + assert(m_impl != nullptr); +} + +InpaintingPipeline InpaintingPipeline::stable_diffusion( + const std::shared_ptr& scheduler, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae) { + auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, unet, vae); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return InpaintingPipeline(impl); +} + +InpaintingPipeline InpaintingPipeline::latent_consistency_model( + const std::shared_ptr& scheduler, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae) { + auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, unet, vae); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return 
InpaintingPipeline(impl); +} + +InpaintingPipeline InpaintingPipeline::stable_diffusion_xl( + const std::shared_ptr& scheduler, + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae) { + auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, clip_text_model_with_projection, unet, vae); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return InpaintingPipeline(impl); +} + +ImageGenerationConfig InpaintingPipeline::get_generation_config() const { + return m_impl->get_generation_config(); +} + +void InpaintingPipeline::set_generation_config(const ImageGenerationConfig& generation_config) { + m_impl->set_generation_config(generation_config); +} + +void InpaintingPipeline::set_scheduler(std::shared_ptr scheduler) { + m_impl->set_scheduler(scheduler); +} + +void InpaintingPipeline::reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) { + m_impl->reshape(num_images_per_prompt, height, width, guidance_scale); +} + +void InpaintingPipeline::compile(const std::string& device, const ov::AnyMap& properties) { + m_impl->compile(device, properties); +} + +ov::Tensor InpaintingPipeline::generate(const std::string& positive_prompt, ov::Tensor initial_image, ov::Tensor mask, const ov::AnyMap& properties) { + OPENVINO_ASSERT(initial_image, "Initial image cannot be empty when passed to InpaintingPipeline::generate"); + OPENVINO_ASSERT(mask, "Mask image cannot be empty when passed to InpaintingPipeline::generate"); + return m_impl->generate(positive_prompt, initial_image, mask, properties); +} + +ov::Tensor InpaintingPipeline::decode(const ov::Tensor latent) { + return m_impl->decode(latent); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp index 7c38cd77fa..e0d6a44189 100644 --- a/src/cpp/src/image_generation/models/autoencoder_kl.cpp +++ b/src/cpp/src/image_generation/models/autoencoder_kl.cpp @@ -102,8 +102,6 @@ AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_encoder_path, : AutoencoderKL(vae_decoder_path) { ov::Core core = utils::singleton_core(); m_encoder_model = core.read_model((vae_encoder_path / "openvino_model.xml").string()); - // apply VaeImageProcessor pre-processing steps by merging them into the VAE encoder - merge_vae_image_pre_processing(); } AutoencoderKL::AutoencoderKL(const std::filesystem::path& vae_decoder_path, @@ -147,8 +145,6 @@ AutoencoderKL::AutoencoderKL(const std::string& vae_encoder_model, : AutoencoderKL(vae_decoder_model, vae_decoder_weights, vae_decoder_config) { ov::Core core = utils::singleton_core(); m_encoder_model = core.read_model(vae_encoder_model, vae_encoder_weights); - // apply VaeImageProcessor pre-processing steps by merging them into the VAE encoder - merge_vae_image_pre_processing(); } AutoencoderKL::AutoencoderKL(const std::string& vae_decoder_model, @@ -191,7 +187,7 @@ AutoencoderKL& AutoencoderKL::reshape(int batch_size, int height, int width) { const size_t vae_scale_factor = get_vae_scale_factor(); OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) && - (width % vae_scale_factor == 0 || width < 0), "Both 'width' and 'height' must be divisible by", + (width % vae_scale_factor == 0 || width < 0), "Both 'width' and 'height' must be divisible by ", vae_scale_factor); if (m_encoder_model) { @@ -275,27 +271,6 
@@ size_t AutoencoderKL::get_vae_scale_factor() const { return std::pow(2, m_config.block_out_channels.size() - 1); } -void AutoencoderKL::merge_vae_image_pre_processing() const { - ov::preprocess::PrePostProcessor ppp(m_encoder_model); - - // https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L90-L110 - - ppp.input().tensor().set_layout("NHWC"); - ppp.input().model().set_layout("NCHW"); - - ppp.input().tensor() - .set_element_type(ov::element::u8); - - ppp.input().preprocess() - .convert_layout() - .convert_element_type(ov::element::f32) - // this is less accurate that in VaeImageProcessor::normalize - .scale(255.0 / 2.0) - .mean(1.0f); - - ppp.build(); -} - void AutoencoderKL::merge_vae_image_post_processing() const { ov::preprocess::PrePostProcessor ppp(m_decoder_model); diff --git a/src/cpp/src/image_generation/numpy_utils.cpp b/src/cpp/src/image_generation/numpy_utils.cpp index 83052dbbf5..d8929d0267 100644 --- a/src/cpp/src/image_generation/numpy_utils.cpp +++ b/src/cpp/src/image_generation/numpy_utils.cpp @@ -74,88 +74,10 @@ std::vector interp(const std::vector& x, const std::vector< return interp_res; } -namespace { - -void concat_3d_axis_2(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2) { - OPENVINO_ASSERT(shape_1.size() == 3 && shape_2.size() == 3, "Shape dimensions must be 3"); - OPENVINO_ASSERT(shape_1[0] == shape_2[0] && shape_1[1] == shape_2[1], "Tensors for concatenation must have the same dimensions"); - - for (size_t i = 0; i < shape_1[0]; ++i) { - for (size_t j = 0; j < shape_1[1]; ++j) { - size_t offset_1 = (i * shape_1[1] + j) * shape_1[2]; - size_t offset_2 = (i * shape_2[1] + j) * shape_2[2]; - - size_t step = (i * shape_1[1] + j) * (shape_1[2] + shape_2[2]); - - std::memcpy(res + step, data_1 + offset_1, shape_1[2] * sizeof(float)); - std::memcpy(res + step + shape_1[2], data_2 + offset_2, shape_2[2] * sizeof(float)); - } - } -} - -void concat_2d_axis_1(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2) { - OPENVINO_ASSERT(shape_1.size() == 2 && shape_2.size() == 2, "Shape dimensions must be 2"); - OPENVINO_ASSERT(shape_1[0] == shape_2[0], "Tensors for concatenation must have the same dimensions"); - - for (size_t i = 0; i < shape_1[0]; ++i) { - size_t offset_1 = i * shape_1[1]; - size_t offset_2 = i * shape_2[1]; - - size_t step = i * (shape_1[1] + shape_2[1]); - - std::memcpy(res + step, data_1 + offset_1, shape_1[1] * sizeof(float)); - std::memcpy(res + step + shape_1[1], - data_2 + offset_2, - shape_2[1] * sizeof(float)); - } -} - -void concat_3d_axis_1(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2) { - OPENVINO_ASSERT(shape_1.size() == 3 && shape_2.size() == 3, "Shape dimensions must be 3"); - OPENVINO_ASSERT(shape_1[0] == shape_2[0] && shape_1[2] == shape_2[2], "Tensors for concatenation must have the same dimensions"); - - for (size_t i = 0; i < shape_1[0]; ++i) { - size_t shift_1 = i * shape_1[1] * shape_1[2]; - size_t shift_2 = i * shape_2[1] * shape_2[2]; - - size_t step = shift_1 + shift_2; - - std::memcpy(res + step, data_1 + shift_1, shape_1[1] * shape_1[2] * sizeof(float)); - std::memcpy(res + step + shape_1[1] * shape_1[2], data_2 + shift_2, shape_2[1] * shape_2[2] * sizeof(float)); - } -} - -void concat_3d_axis_0(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape 
shape_2) { - OPENVINO_ASSERT(shape_1.size() == 3 && shape_2.size() == 3, "Shape dimensions must be 3"); - OPENVINO_ASSERT(shape_1[1] == shape_2[1] && shape_1[2] == shape_2[2], "Tensors for concatenation must have the same dimensions"); - - size_t size_1 = shape_1[0] * shape_1[1] * shape_1[2]; - size_t size_2 = shape_2[0] * shape_2[1] * shape_2[2]; - - std::memcpy(res, data_1, size_1 * sizeof(float)); - std::memcpy(res + size_1, data_2, size_2 * sizeof(float)); -} - -void concat_2d_axis_0(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2) { - OPENVINO_ASSERT(shape_1.size() == 2 && shape_2.size() == 2, "Shape dimensions must be 2"); - OPENVINO_ASSERT(shape_1[1] == shape_2[1], "Tensors for concatenation must have the same dimensions"); - - size_t size_1 = shape_1[0] * shape_1[1]; - size_t size_2 = shape_2[0] * shape_2[1]; - - std::memcpy(res, data_1, size_1 * sizeof(float)); - std::memcpy(res + size_1, data_2, size_2 * sizeof(float)); -} - -} // namespace - ov::Tensor concat(ov::Tensor tensor_1, ov::Tensor tensor_2, int axis) { ov::Shape shape_1 = tensor_1.get_shape(), shape_2 = tensor_2.get_shape(); size_t rank = shape_1.size(); - const size_t MAX_RANK = 3; - OPENVINO_ASSERT(rank <= MAX_RANK, "Maximum support rank of concatenated tensors is ", MAX_RANK, ", given rank is ", rank); - OPENVINO_ASSERT(rank == shape_2.size(), "Shapes for concatenated tensors must have the same rank"); OPENVINO_ASSERT(tensor_1.get_element_type() == ov::element::f32 && tensor_2.get_element_type() == ov::element::f32, "Concat supports only tensor of fp32 data type"); @@ -170,18 +92,31 @@ ov::Tensor concat(ov::Tensor tensor_1, ov::Tensor tensor_2, int axis) { dst_shape[d] = d == axis ? shape_1[d] + shape_2[d] : shape_1[d]; } - typedef void (*concat_func_type) (const float*, const float*, float*, const ov::Shape, const ov::Shape); - concat_func_type concat_funcs [MAX_RANK][MAX_RANK] = { - { nullptr, nullptr, nullptr }, - { concat_2d_axis_0, concat_2d_axis_1, nullptr }, - { concat_3d_axis_0, concat_3d_axis_1, concat_3d_axis_2 } - }; + size_t num_iterations = 1; + for (size_t d = 0; d < axis; ++d) { + num_iterations *= shape_1[d]; + } - concat_func_type concat_func = concat_funcs[rank - 1][axis]; - OPENVINO_ASSERT(concat_func != nullptr, "Unsupported combination of input tensors rank ", rank, " and axis ", axis); + size_t chunk_1 = 1, chunk_2 = 1; + for (size_t d = axis; d < shape_1.size(); ++d) { + chunk_1 *= shape_1[d]; + chunk_2 *= shape_2[d]; + } ov::Tensor dst_tensor(tensor_1.get_element_type(), dst_shape); - concat_func(tensor_1.data(), tensor_2.data(), dst_tensor.data(), shape_1, shape_2); + float * res = dst_tensor.data(); + + const float * data_1 = tensor_1.data(); + const float * data_2 = tensor_2.data(); + + for (size_t i = 0; i < num_iterations; ++i) { + std::memcpy(res , data_1, chunk_1 * sizeof(float)); + std::memcpy(res + chunk_1, data_2, chunk_2 * sizeof(float)); + + res += chunk_1 + chunk_2; + data_1 += chunk_1; + data_2 += chunk_2; + } return dst_tensor; } @@ -200,12 +135,16 @@ void batch_copy(ov::Tensor src, ov::Tensor dst, size_t src_batch, size_t dst_bat ov::Tensor(src, src_start, src_end).copy_to(ov::Tensor(dst, dst_start, dst_end)); } -ov::Tensor repeat(const ov::Tensor input, const size_t num_images_per_prompt) { - ov::Shape repeated_shape = input.get_shape(); - repeated_shape[0] *= num_images_per_prompt; +ov::Tensor repeat(const ov::Tensor input, const size_t n_times) { + if (n_times == 1) + return input; + + ov::Shape input_shape = 
input.get_shape(), repeated_shape = input_shape; + repeated_shape[0] *= n_times; + ov::Tensor tensor_repeated(input.get_element_type(), repeated_shape); - for (size_t n = 0; n < num_images_per_prompt; ++n) { - batch_copy(input, tensor_repeated, 0, n); + for (size_t n = 0; n < n_times; ++n) { + batch_copy(input, tensor_repeated, 0, n, input_shape[0]); } return tensor_repeated; } diff --git a/src/cpp/src/image_generation/schedulers/ddim.cpp b/src/cpp/src/image_generation/schedulers/ddim.cpp index 4b9b914d05..768ba56837 100644 --- a/src/cpp/src/image_generation/schedulers/ddim.cpp +++ b/src/cpp/src/image_generation/schedulers/ddim.cpp @@ -205,21 +205,18 @@ void DDIMScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) return; } -void DDIMScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr generator) const { - int64_t latent_timestep = m_timesteps.front(); - +void DDIMScheduler::add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const { float sqrt_alpha_prod = std::sqrt(m_alphas_cumprod[latent_timestep]); float sqrt_one_minus_alpha_prod = std::sqrt(1.0 - m_alphas_cumprod[latent_timestep]); - ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape()); - float * init_latent_data = init_latent.data(); - const float * rand_tensor_data = rand_tensor.data(); + const float * noise_data = noise.data(); for (size_t i = 0; i < init_latent.get_size(); ++i) { - init_latent_data[i] = sqrt_alpha_prod * init_latent_data[i] + sqrt_one_minus_alpha_prod * rand_tensor_data[i]; + init_latent_data[i] = sqrt_alpha_prod * init_latent_data[i] + sqrt_one_minus_alpha_prod * noise_data[i]; } } + } // namespace genai } // namespace ov diff --git a/src/cpp/src/image_generation/schedulers/ddim.hpp b/src/cpp/src/image_generation/schedulers/ddim.hpp index f561ded149..7e042cf03e 100644 --- a/src/cpp/src/image_generation/schedulers/ddim.hpp +++ b/src/cpp/src/image_generation/schedulers/ddim.hpp @@ -45,7 +45,7 @@ class DDIMScheduler : public IScheduler { std::map step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) override; - void add_noise(ov::Tensor init_latent, std::shared_ptr generator) const override; + virtual void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t timestep) const override; private: Config m_config; diff --git a/src/cpp/src/image_generation/schedulers/euler_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_discrete.cpp index 74c587432b..fe5c185437 100644 --- a/src/cpp/src/image_generation/schedulers/euler_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/euler_discrete.cpp @@ -301,17 +301,14 @@ size_t EulerDiscreteScheduler::_index_for_timestep(int64_t timestep) const { OPENVINO_THROW("Failed to find index for timestep ", timestep); } -void EulerDiscreteScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr generator) const { - const int64_t latent_timestep = m_timesteps.front(); +void EulerDiscreteScheduler::add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const { const float sigma = m_sigmas[_index_for_timestep(latent_timestep)]; - ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape()); - float * init_latent_data = init_latent.data(); - const float * rand_tensor_data = rand_tensor.data(); + const float * noise_data = noise.data(); for (size_t i = 0; i < init_latent.get_size(); ++i) { - init_latent_data[i] = init_latent_data[i] + sigma * rand_tensor_data[i]; + init_latent_data[i] = init_latent_data[i] + sigma * 
noise_data[i]; } } diff --git a/src/cpp/src/image_generation/schedulers/euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/euler_discrete.hpp index 59e20c1be0..43620e7de4 100644 --- a/src/cpp/src/image_generation/schedulers/euler_discrete.hpp +++ b/src/cpp/src/image_generation/schedulers/euler_discrete.hpp @@ -47,7 +47,7 @@ class EulerDiscreteScheduler : public IScheduler { std::map step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) override; - void add_noise(ov::Tensor init_latent, std::shared_ptr generator) const override; + void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override; private: Config m_config; diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp index 54ea36f7d1..21d464d7f9 100644 --- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp @@ -141,7 +141,7 @@ void FlowMatchEulerDiscreteScheduler::init_step_index() { m_step_index = (m_begin_index == -1) ? 0 : m_begin_index; } -void FlowMatchEulerDiscreteScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr generator) const { +void FlowMatchEulerDiscreteScheduler::add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const { // use https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py#L117 OPENVINO_THROW("Not implemented"); } diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp index 1386551a60..6410790b92 100644 --- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp +++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp @@ -40,7 +40,7 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler { std::map step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) override; - void add_noise(ov::Tensor init_latent, std::shared_ptr generator) const override; + void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override; float calculate_shift(size_t image_seq_len) override; diff --git a/src/cpp/src/image_generation/schedulers/ischeduler.hpp b/src/cpp/src/image_generation/schedulers/ischeduler.hpp index 18c6572e5d..a6f61b5343 100644 --- a/src/cpp/src/image_generation/schedulers/ischeduler.hpp +++ b/src/cpp/src/image_generation/schedulers/ischeduler.hpp @@ -26,7 +26,7 @@ class IScheduler : public Scheduler { virtual std::map step( ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) = 0; - virtual void add_noise(ov::Tensor init_latent, std::shared_ptr generator) const = 0; + virtual void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const = 0; virtual float calculate_shift(size_t image_seq_len) { OPENVINO_THROW("Scheduler doesn't support `calculate_shift` method"); diff --git a/src/cpp/src/image_generation/schedulers/lcm.cpp b/src/cpp/src/image_generation/schedulers/lcm.cpp index cc1f9a774f..89aed5b5f8 100644 --- a/src/cpp/src/image_generation/schedulers/lcm.cpp +++ b/src/cpp/src/image_generation/schedulers/lcm.cpp @@ -243,19 +243,15 @@ std::vector LCMScheduler::threshold_sample(const std::vector& flat return thresholded_sample; } -void 
LCMScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr generator) const { - int64_t latent_timestep = m_timesteps.front(); - +void LCMScheduler::add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const { float sqrt_alpha_prod = std::sqrt(m_alphas_cumprod[latent_timestep]); float sqrt_one_minus_alpha_prod = std::sqrt(1.0f - m_alphas_cumprod[latent_timestep]); - ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape()); - float * init_latent_data = init_latent.data(); - const float * rand_tensor_data = rand_tensor.data(); + const float * noise_data = noise.data(); for (size_t i = 0; i < init_latent.get_size(); ++i) { - init_latent_data[i] = sqrt_alpha_prod * init_latent_data[i] + sqrt_one_minus_alpha_prod * rand_tensor_data[i]; + init_latent_data[i] = sqrt_alpha_prod * init_latent_data[i] + sqrt_one_minus_alpha_prod * noise_data[i]; } } diff --git a/src/cpp/src/image_generation/schedulers/lcm.hpp b/src/cpp/src/image_generation/schedulers/lcm.hpp index 0304cd301a..b9e1a55270 100644 --- a/src/cpp/src/image_generation/schedulers/lcm.hpp +++ b/src/cpp/src/image_generation/schedulers/lcm.hpp @@ -52,7 +52,7 @@ class LCMScheduler : public IScheduler { std::map step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) override; - void add_noise(ov::Tensor init_latent, std::shared_ptr generator) const override; + void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override; private: Config m_config; diff --git a/src/cpp/src/image_generation/schedulers/lms_discrete.cpp b/src/cpp/src/image_generation/schedulers/lms_discrete.cpp index 125ebf326d..d8c3c23745 100644 --- a/src/cpp/src/image_generation/schedulers/lms_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/lms_discrete.cpp @@ -254,7 +254,7 @@ std::map LMSDiscreteScheduler::step(ov::Tensor noise_pr return result; } -void LMSDiscreteScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr generator) const { +void LMSDiscreteScheduler::add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const { // use https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_ddim.py#L474 OPENVINO_THROW("Not implemented"); } diff --git a/src/cpp/src/image_generation/schedulers/lms_discrete.hpp b/src/cpp/src/image_generation/schedulers/lms_discrete.hpp index 7a6b9d314f..53a3eb8c39 100644 --- a/src/cpp/src/image_generation/schedulers/lms_discrete.hpp +++ b/src/cpp/src/image_generation/schedulers/lms_discrete.hpp @@ -41,7 +41,7 @@ class LMSDiscreteScheduler : public IScheduler { std::map step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) override; - void add_noise(ov::Tensor init_latent, std::shared_ptr generator) const override; + void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override; private: Config m_config; diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp index 5a5057062c..c4280b3064 100644 --- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp @@ -1,6 +1,8 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include #include @@ -10,6 +12,7 @@ #include "openvino/genai/image_generation/autoencoder_kl.hpp" #include "openvino/genai/image_generation/clip_text_model.hpp" 
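Across the schedulers above, `add_noise` now takes pre-generated noise and an explicit timestep instead of drawing noise from the generator itself. A minimal sketch of how the pipelines drive the new contract (mirroring the `prepare_latents` code later in this patch; `scheduler`, `generator` and `latent_shape` are assumed to be set up by the caller):

```cpp
// Sketch only: the caller owns noise generation and timestep selection.
ov::Tensor noise = generator->randn_tensor(latent_shape);       // reused later, e.g. when blending inpainting latents
int64_t latent_timestep = scheduler->get_timesteps().front();   // first (noisiest) timestep that will actually run

// DDIM / LCM:      latent = sqrt(alpha_cumprod[t]) * latent + sqrt(1 - alpha_cumprod[t]) * noise
// EulerDiscrete:   latent = latent + sigma[t] * noise
scheduler->add_noise(latent, noise, latent_timestep);
```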
#include "openvino/genai/image_generation/clip_text_model_with_projection.hpp" +#include "openvino/genai/image_generation/t5_encoder_model.hpp" #include "openvino/genai/image_generation/sd3_transformer_2d_model.hpp" #include "utils.hpp" @@ -80,25 +83,28 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { OPENVINO_THROW("Unsupported '", text_encoder_2, "' text encoder type"); } - const std::string text_encoder_3 = data["text_encoder_3"][1].get(); - if (text_encoder_3 == "T5EncoderModel") { - m_t5_text_encoder = std::make_shared(root_dir / "text_encoder_3"); - } else { - m_t5_text_encoder = nullptr; + const auto text_encoder_3_json = data["text_encoder_3"][1]; + if (!text_encoder_3_json.is_null()) { + const std::string text_encoder_3 = text_encoder_3_json.get(); + if (text_encoder_3 == "T5EncoderModel") { + m_t5_text_encoder = std::make_shared(root_dir / "text_encoder_3"); + } else { + OPENVINO_THROW("Unsupported '", text_encoder_3, "' text encoder type"); + } } const std::string transformer = data["transformer"][1].get(); if (transformer == "SD3Transformer2DModel") { m_transformer = std::make_shared(root_dir / "transformer"); } else { - OPENVINO_THROW("Unsupported '", transformer, "'Transformer type"); + OPENVINO_THROW("Unsupported '", transformer, "' Transformer type"); } const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) m_vae = std::make_shared(root_dir / "vae_decoder"); - else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { + else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder"); } else { OPENVINO_ASSERT("Unsupported pipeline type"); @@ -141,23 +147,28 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { OPENVINO_THROW("Unsupported '", text_encoder_2, "' text encoder type"); } - const std::string text_encoder_3 = data["text_encoder_3"][1].get(); - if (text_encoder_3 == "T5EncoderModel") { - m_t5_text_encoder = std::make_shared(root_dir / "text_encoder_3", device, properties); + const auto text_encoder_3_json = data["text_encoder_3"][1]; + if (!text_encoder_3_json.is_null()) { + const std::string text_encoder_3 = text_encoder_3_json.get(); + if (text_encoder_3 == "T5EncoderModel") { + m_t5_text_encoder = std::make_shared(root_dir / "text_encoder_3", device, properties); + } else { + OPENVINO_THROW("Unsupported '", text_encoder_3, "' text encoder type"); + } } const std::string transformer = data["transformer"][1].get(); if (transformer == "SD3Transformer2DModel") { m_transformer = std::make_shared(root_dir / "transformer", device, properties); } else { - OPENVINO_THROW("Unsupported '", transformer, "'Transformer type"); + OPENVINO_THROW("Unsupported '", transformer, "' Transformer type"); } const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) m_vae = std::make_shared(root_dir / "vae_decoder", device, properties); - else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { + else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder", device, properties); } else { OPENVINO_ASSERT("Unsupported pipeline type"); @@ -187,6 +198,19 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { initialize_generation_config("StableDiffusion3Pipeline"); } + 
StableDiffusion3Pipeline(PipelineType pipeline_type, + const CLIPTextModelWithProjection& clip_text_model_1, + const CLIPTextModelWithProjection& clip_text_model_2, + const SD3Transformer2DModel& transformer, + const AutoencoderKL& vae) + : DiffusionPipeline(pipeline_type), + m_clip_text_encoder_1(std::make_shared(clip_text_model_1)), + m_clip_text_encoder_2(std::make_shared(clip_text_model_2)), + m_vae(std::make_shared(vae)), + m_transformer(std::make_shared(transformer)) { + initialize_generation_config("StableDiffusion3Pipeline"); + } + void reshape(const int num_images_per_prompt, const int height, const int width, @@ -197,7 +221,9 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { do_classifier_free_guidance(guidance_scale) ? 2 : 1; // Transformer accepts 2x batch in case of CFG m_clip_text_encoder_1->reshape(batch_size_multiplier); m_clip_text_encoder_2->reshape(batch_size_multiplier); - m_t5_text_encoder->reshape(batch_size_multiplier, m_generation_config.max_sequence_length); + if (m_t5_text_encoder) { + m_t5_text_encoder->reshape(batch_size_multiplier, m_generation_config.max_sequence_length); + } m_transformer->reshape(num_images_per_prompt * batch_size_multiplier, height, width, @@ -210,7 +236,9 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { m_clip_text_encoder_1->compile(device, properties); m_clip_text_encoder_2->compile(device, properties); - m_t5_text_encoder->compile(device, properties); + if (m_t5_text_encoder) { + m_t5_text_encoder->compile(device, properties); + } m_transformer->compile(device, properties); m_vae->compile(device, properties); } @@ -245,17 +273,17 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { ov::Tensor text_encoder_2_hidden_state = m_clip_text_encoder_2->get_output_tensor(idx_hidden_state_2); ov::Tensor text_encoder_3_output; - if (m_t5_text_encoder == nullptr) { + if (m_t5_text_encoder) { + text_encoder_3_output = m_t5_text_encoder->infer(prompt_3_str, + negative_prompt_3_str, + do_classifier_free_guidance(generation_config.guidance_scale), + generation_config.max_sequence_length); + } else { ov::Shape t5_prompt_embed_shape = {generation_config.num_images_per_prompt, m_clip_text_encoder_1->get_config().max_position_embeddings, transformer_config.joint_attention_dim}; text_encoder_3_output = ov::Tensor(ov::element::f32, t5_prompt_embed_shape); std::fill_n(text_encoder_3_output.data(), text_encoder_3_output.get_size(), 0.0f); - } else { - text_encoder_3_output = m_t5_text_encoder->infer(prompt_3_str, - negative_prompt_3_str, - do_classifier_free_guidance(generation_config.guidance_scale), - m_generation_config.max_sequence_length); } ov::Tensor pooled_prompt_embed_out, prompt_embed_out, pooled_prompt_2_embed_out, prompt_2_embed_out, t5_prompt_embed_out; @@ -298,7 +326,6 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { // padding for clip_prompt_embeds ov::Shape pad_embeds_shape = {clip_prompt_embeds_shape[0], clip_prompt_embeds_shape[1], t5_prompt_embed_shape[2]}; ov::Tensor pad_embeds(ov::element::f32, pad_embeds_shape); - padding_right(clip_prompt_embeds, pad_embeds); // prompt_embeds = torch.cat([pad_embeds, t5_prompt_embed], dim=-2) @@ -356,31 +383,38 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { m_transformer->set_hidden_states("pooled_projections", pooled_prompt_embeds_inp); } - ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { + std::tuple prepare_latents(ov::Tensor initial_image, const 
ImageGenerationConfig& generation_config) const override { const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); ov::Shape latent_shape{generation_config.num_images_per_prompt, m_transformer->get_config().in_channels, generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; - ov::Tensor latent(ov::element::f32, {}); + ov::Tensor latent(ov::element::f32, {}), proccesed_image, image_latent, noise; if (initial_image) { OPENVINO_THROW("StableDiffusion3 image to image is not implemented"); } else { - latent = generation_config.generator->randn_tensor(latent_shape); + noise = generation_config.generator->randn_tensor(latent_shape); + latent.set_shape(latent_shape); // latents are multiplied by 'init_noise_sigma' + const float * noise_data = noise.data(); float * latent_data = latent.data(); for (size_t i = 0; i < latent.get_size(); ++i) - latent_data[i] *= m_scheduler->get_init_noise_sigma(); + latent_data[i] = noise_data[i] * m_scheduler->get_init_noise_sigma(); } - return latent; + return std::make_tuple(latent, proccesed_image, image_latent, noise); + } + + void set_lora_adapters(std::optional adapters) override { + OPENVINO_THROW("LORA adapters are not implemented for Stable Diffusion 3 yet"); } ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, + ov::Tensor mask_image, const ov::AnyMap& properties) override { ImageGenerationConfig generation_config = m_generation_config; generation_config.update_generation_config(properties); @@ -416,7 +450,8 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { compute_hidden_states(positive_prompt, generation_config); // 5. Prepare latent variables - ov::Tensor latent = prepare_latents(initial_image, generation_config); + ov::Tensor latent, processed_image, image_latent, noise; + std::tie(latent, processed_image, image_latent, noise) = prepare_latents(initial_image, generation_config); ov::Shape latent_shape_cfg = latent.get_shape(); latent_shape_cfg[0] *= batch_size_multiplier; @@ -497,10 +532,11 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { m_generation_config.height = transformer_config.sample_size * vae_scale_factor; m_generation_config.width = transformer_config.sample_size * vae_scale_factor; - if (class_name == "StableDiffusion3Pipeline") { + if (class_name == "StableDiffusion3Pipeline" || class_name == "StableDiffusion3Img2ImgPipeline" || class_name == "StableDiffusion3InpaintPipeline") { m_generation_config.guidance_scale = 7.0f; m_generation_config.num_inference_steps = 28; m_generation_config.max_sequence_length = 256; + m_generation_config.strength = m_pipeline_type == PipelineType::TEXT_2_IMAGE ? 1.0f : 0.6f; } else { OPENVINO_THROW("Unsupported class_name '", class_name, "'. 
Please, contact OpenVINO GenAI developers"); } @@ -515,7 +551,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { OPENVINO_ASSERT((height % (vae_scale_factor * patch_size) == 0 || height < 0) && (width % (vae_scale_factor * patch_size) == 0 || width < 0), - "Both 'width' and 'height' must be divisible by", + "Both 'width' and 'height' must be divisible by ", vae_scale_factor); } @@ -532,7 +568,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used when guidance scale < 1.0"); - if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) { + if ((m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) && initial_image) { ov::Shape initial_image_shape = initial_image.get_shape(); size_t height = initial_image_shape[1], width = initial_image_shape[2]; diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp index b7153f282a..9dbdbac088 100644 --- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp @@ -1,28 +1,53 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include #include #include #include "image_generation/diffusion_pipeline.hpp" #include "image_generation/numpy_utils.hpp" +#include "image_generation/image_processor.hpp" #include "openvino/genai/image_generation/autoencoder_kl.hpp" #include "openvino/genai/image_generation/clip_text_model.hpp" #include "openvino/genai/image_generation/clip_text_model_with_projection.hpp" #include "openvino/genai/image_generation/unet2d_condition_model.hpp" +#include "openvino/runtime/core.hpp" + #include "json_utils.hpp" #include "lora_helper.hpp" +#include "debug_utils.hpp" +#include "numpy_utils.hpp" namespace ov { namespace genai { class StableDiffusionPipeline : public DiffusionPipeline { public: - StableDiffusionPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir) : + explicit StableDiffusionPipeline(PipelineType pipeline_type) : DiffusionPipeline(pipeline_type) { + // TODO: support GPU as well + const std::string device = "CPU"; + + if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { + const bool do_normalize = true, do_binarize = false; + m_image_processor = std::make_shared(device, do_normalize, do_binarize); + m_image_resizer = std::make_shared(device, ov::element::u8, "NHWC", ov::op::v11::Interpolate::InterpolateMode::BICUBIC_PILLOW); + } + + if (m_pipeline_type == PipelineType::INPAINTING) { + const bool do_normalize = false, do_binarize = true; + m_mask_processor = std::make_shared(device, do_normalize, do_binarize); + m_mask_resizer = std::make_shared(device, ov::element::f32, "NCHW", ov::op::v11::Interpolate::InterpolateMode::NEAREST); + } + } + + StableDiffusionPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir) : + StableDiffusionPipeline(pipeline_type) { const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); @@ -50,7 +75,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { if (vae == "AutoencoderKL") { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) m_vae = std::make_shared(root_dir / "vae_decoder"); 
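The `strength` defaults above (1.0 for text-to-image, 0.6 for image-to-image and inpainting) follow the usual diffusers semantics: `strength` decides how many of the scheduled denoising steps actually run and therefore how strongly the initial image is altered. A small worked sketch of that relationship (assuming the standard diffusers formula; the helper name is illustrative, not a function from this patch):

```cpp
// strength == 1.0 -> start from pure noise and run every scheduled step;
// strength == 0.0 -> skip denoising entirely and keep the input image.
size_t executed_denoising_steps(size_t num_inference_steps, float strength) {
    size_t init_timestep = static_cast<size_t>(num_inference_steps * strength);
    if (init_timestep > num_inference_steps)
        init_timestep = num_inference_steps;
    return init_timestep;  // e.g. 50 steps with strength 0.6 -> only the last 30 timesteps run
}
```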
- else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { + else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder"); } else { OPENVINO_ASSERT("Unsupported pipeline type"); @@ -64,7 +89,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { } StableDiffusionPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) : - DiffusionPipeline(pipeline_type) { + StableDiffusionPipeline(pipeline_type) { const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); @@ -92,7 +117,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { if (vae == "AutoencoderKL") { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) m_vae = std::make_shared(root_dir / "vae_decoder", device, properties); - else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { + else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder", device, properties); } else { OPENVINO_ASSERT("Unsupported pipeline type"); @@ -112,10 +137,11 @@ class StableDiffusionPipeline : public DiffusionPipeline { const CLIPTextModel& clip_text_model, const UNet2DConditionModel& unet, const AutoencoderKL& vae) - : DiffusionPipeline(pipeline_type), - m_clip_text_encoder(std::make_shared(clip_text_model)), - m_unet(std::make_shared(unet)), - m_vae(std::make_shared(vae)) { + : StableDiffusionPipeline(pipeline_type) { + m_clip_text_encoder = std::make_shared(clip_text_model); + m_unet = std::make_shared(unet); + m_vae = std::make_shared(vae); + const bool is_lcm = m_unet->get_config().time_cond_proj_dim > 0; const char * const pipeline_name = is_lcm ? 
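// A `time_cond_proj_dim > 0` in the UNet config indicates an LCM-distilled UNet: it consumes the guidance
// scale as an extra embedding instead of relying on classifier-free guidance, hence the pipeline name switch here.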
"LatentConsistencyModelPipeline" : "StableDiffusionPipeline"; initialize_generation_config(pipeline_name); @@ -172,38 +198,103 @@ class StableDiffusionPipeline : public DiffusionPipeline { } } - ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { - const auto& unet_config = m_unet->get_config(); + std::tuple prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { + std::vector timesteps = m_scheduler->get_timesteps(); + OPENVINO_ASSERT(!timesteps.empty(), "Timesteps are not computed yet"); + int64_t latent_timestep = timesteps.front(); + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + const bool is_inpainting = m_pipeline_type == PipelineType::INPAINTING, + is_strength_max = is_inpainting && generation_config.strength == 1.0f, + is_inpainting_model = is_inpainting && m_unet->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1), + return_image_latent = is_inpainting && !is_inpainting_model; - ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels, + ov::Shape latent_shape{generation_config.num_images_per_prompt, m_vae->get_config().latent_channels, generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; - ov::Tensor latent; + ov::Tensor latent(ov::element::f32, {}), proccesed_image, image_latent, noise; if (initial_image) { - latent = m_vae->encode(initial_image, generation_config.generator); - if (generation_config.num_images_per_prompt > 1) { - ov::Tensor batched_latent(ov::element::f32, latent_shape); - for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { - numpy_utils::batch_copy(latent, batched_latent, 0, n); + proccesed_image = m_image_resizer->execute(initial_image, generation_config.height, generation_config.width); + proccesed_image = m_image_processor->execute(proccesed_image); + + // prepate image latent for cases: + // - image to image + // - inpainting with strength < 1.0 + // - inpainting with non-specialized model + if (!is_strength_max || return_image_latent) { + image_latent = m_vae->encode(proccesed_image, generation_config.generator); + + // in case of image to image or inpaining with strength < 1.0, we need to initialize initial latent with image_latent + if (!is_strength_max) { + image_latent.copy_to(latent); + latent = numpy_utils::repeat(latent, generation_config.num_images_per_prompt); } - latent = batched_latent; } - m_scheduler->add_noise(latent, generation_config.generator); + } + + noise = generation_config.generator->randn_tensor(latent_shape); + + if (!latent.get_shape().empty()) { + m_scheduler->add_noise(latent, noise, latent_timestep); } else { - latent = generation_config.generator->randn_tensor(latent_shape); + latent.set_shape(latent_shape); - // latents are multiplied by 'init_noise_sigma' + // if pure noise then scale the initial latents by the Scheduler's init sigma + const float * noise_data = noise.data(); float * latent_data = latent.data(); for (size_t i = 0; i < latent.get_size(); ++i) - latent_data[i] *= m_scheduler->get_init_noise_sigma(); + latent_data[i] = noise_data[i] * m_scheduler->get_init_noise_sigma(); + } + + return std::make_tuple(latent, proccesed_image, image_latent, noise); + } + + std::tuple prepare_mask_latents(ov::Tensor mask_image, ov::Tensor processed_image, const ImageGenerationConfig& generation_config) { + OPENVINO_ASSERT(m_pipeline_type == PipelineType::INPAINTING, "'prepare_mask_latents' 
can be called for inpainting pipeline only"); + + const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Unet accepts 2x batch in case of CFG + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + const bool is_inpainting_model = m_unet->get_config().in_channels == (m_vae->get_config().latent_channels * 2 + 1); + ov::Shape target_shape = processed_image.get_shape(); + + ov::Tensor mask_condition = m_image_resizer->execute(mask_image, target_shape[2], target_shape[3]); + mask_condition = m_mask_processor->execute(mask_condition); + + // resize mask to shape of latent space + ov::Tensor mask = m_mask_resizer->execute(mask_condition, target_shape[2] / vae_scale_factor, target_shape[3] / vae_scale_factor); + mask = numpy_utils::repeat(mask, generation_config.num_images_per_prompt * batch_size_multiplier); + + ov::Tensor masked_image_latent; + + if (is_inpainting_model) { + // create masked image + ov::Tensor masked_image(ov::element::f32, processed_image.get_shape()); + const float * mask_condition_data = mask_condition.data(); + const float * processed_image_data = processed_image.data(); + float * masked_image_data = masked_image.data(); + + for (size_t i = 0, plane_size = mask_condition.get_shape()[2] * mask_condition.get_shape()[3]; i < mask_condition.get_size(); ++i) { + masked_image_data[i + 0 * plane_size] = mask_condition_data[i] < 0.5f ? processed_image_data[i + 0 * plane_size] : 0.0f; + masked_image_data[i + 1 * plane_size] = mask_condition_data[i] < 0.5f ? processed_image_data[i + 1 * plane_size] : 0.0f; + masked_image_data[i + 2 * plane_size] = mask_condition_data[i] < 0.5f ? processed_image_data[i + 2 * plane_size] : 0.0f; + } + + // encode masked image to latent scape + masked_image_latent = m_vae->encode(masked_image, generation_config.generator); + masked_image_latent = numpy_utils::repeat(masked_image_latent, generation_config.num_images_per_prompt * batch_size_multiplier); } - return latent; + return std::make_tuple(mask, masked_image_latent); + } + + void set_lora_adapters(std::optional adapters) override { + m_clip_text_encoder->set_adapters(adapters); + m_unet->set_adapters(adapters); } ov::Tensor generate(const std::string& positive_prompt, ov::Tensor initial_image, + ov::Tensor mask_image, const ov::AnyMap& properties) override { using namespace numpy_utils; ImageGenerationConfig generation_config = m_generation_config; @@ -214,21 +305,29 @@ class StableDiffusionPipeline : public DiffusionPipeline { generation_config.strength = 1.0f; } + // use callback if defined + std::function callback; + auto callback_iter = properties.find(ov::genai::callback.name()); + bool do_callback = callback_iter != properties.end(); + if (do_callback) { + callback = callback_iter->second.as>(); + } + // Stable Diffusion pipeline // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline const auto& unet_config = m_unet->get_config(); const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 
2 : 1; // Unet accepts 2x batch in case of CFG const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + const bool is_inpainting_model = unet_config.in_channels == (m_vae->get_config().latent_channels * 2 + 1); if (generation_config.height < 0) - generation_config.height = unet_config.sample_size * vae_scale_factor; + compute_dim(generation_config.height, initial_image, 1 /* assume NHWC */); if (generation_config.width < 0) - generation_config.width = unet_config.sample_size * vae_scale_factor; + compute_dim(generation_config.width, initial_image, 2 /* assume NHWC */); check_inputs(generation_config, initial_image); - m_clip_text_encoder->set_adapters(generation_config.adapters); - m_unet->set_adapters(generation_config.adapters); + set_lora_adapters(generation_config.adapters); if (generation_config.generator == nullptr) { uint32_t seed = time(NULL); @@ -241,23 +340,22 @@ class StableDiffusionPipeline : public DiffusionPipeline { // compute text encoders and set hidden states compute_hidden_states(positive_prompt, generation_config); - // preparate initial latents - ov::Tensor latent = prepare_latents(initial_image, generation_config); + // preparate initial / image latents + ov::Tensor latent, processed_image, image_latent, noise; + std::tie(latent, processed_image, image_latent, noise) = prepare_latents(initial_image, generation_config); + + // prepare mask latents + ov::Tensor mask, masked_image_latent; + if (m_pipeline_type == PipelineType::INPAINTING) { + std::tie(mask, masked_image_latent) = prepare_mask_latents(mask_image, processed_image, generation_config); + } // prepare latents passed to models taking into account guidance scale (batch size multipler) ov::Shape latent_shape_cfg = latent.get_shape(); latent_shape_cfg[0] *= batch_size_multiplier; - ov::Tensor latent_cfg(ov::element::f32, latent_shape_cfg); - // use callback if defined - std::function callback; - auto callback_iter = properties.find(ov::genai::callback.name()); - bool do_callback = callback_iter != properties.end(); - if (do_callback) { - callback = callback_iter->second.as>(); - } + ov::Tensor latent_cfg(ov::element::f32, latent_shape_cfg), denoised, noisy_residual_tensor(ov::element::f32, {}), latent_model_input; - ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {}); for (size_t inference_step = 0; inference_step < timesteps.size(); inference_step++) { numpy_utils::batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); // concat the same latent twice along a batch dimension in case of CFG @@ -267,12 +365,13 @@ class StableDiffusionPipeline : public DiffusionPipeline { m_scheduler->scale_model_input(latent_cfg, inference_step); + ov::Tensor latent_model_input = is_inpainting_model ? 
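// For a dedicated inpainting UNet, the concatenated input built here is laid out along the channel axis as:
//   [0, C)          noisy latent (C = VAE latent_channels, typically 4)
//   [C, C + 1)      single-channel binary mask
//   [C + 1, 2C + 1) VAE-encoded masked image
// which is why `is_inpainting_model` above tests `in_channels == latent_channels * 2 + 1`
// (9 channels for Stable Diffusion inpainting checkpoints).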
numpy_utils::concat(numpy_utils::concat(latent_cfg, mask, 1), masked_image_latent, 1) : latent_cfg; ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); - ov::Tensor noise_pred_tensor = m_unet->infer(latent_cfg, timestep); + ov::Tensor noise_pred_tensor = m_unet->infer(latent_model_input, timestep); ov::Shape noise_pred_shape = noise_pred_tensor.get_shape(); noise_pred_shape[0] /= batch_size_multiplier; - + if (batch_size_multiplier > 1) { noisy_residual_tensor.set_shape(noise_pred_shape); @@ -292,6 +391,11 @@ class StableDiffusionPipeline : public DiffusionPipeline { auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step, generation_config.generator); latent = scheduler_step_result["latent"]; + // in case of non-specialized inpainting model, we need manually mask current denoised latent and initial image latent + if (m_pipeline_type == PipelineType::INPAINTING && !is_inpainting_model) { + blend_latents(image_latent, noise, mask, latent, inference_step); + } + // check whether scheduler returns "denoised" image, which should be passed to VAE decoder const auto it = scheduler_step_result.find("denoised"); denoised = it != scheduler_step_result.end() ? it->second : latent; @@ -310,21 +414,41 @@ class StableDiffusionPipeline : public DiffusionPipeline { return m_vae->decode(latent); } -private: +protected: + void compute_dim(int64_t & generation_config_value, ov::Tensor initial_image, int dim_idx) { + const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); + const auto& unet_config = m_unet->get_config(); + + // in case of image to image generation_config_value is just ignored and computed based on initial image + if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { + OPENVINO_ASSERT(initial_image, "Initial image is empty for image to image pipeline"); + ov::Shape shape = initial_image.get_shape(); + int64_t dim_val = shape[dim_idx]; + + generation_config_value = dim_val - (dim_val % vae_scale_factor); + } + + if (generation_config_value < 0) + generation_config_value = unet_config.sample_size * vae_scale_factor; + } + void initialize_generation_config(const std::string& class_name) override { assert(m_unet != nullptr); assert(m_vae != nullptr); const auto& unet_config = m_unet->get_config(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - m_generation_config.height = unet_config.sample_size * vae_scale_factor; - m_generation_config.width = unet_config.sample_size * vae_scale_factor; + // in case of image to image, the shape is computed based on initial image + if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) { + m_generation_config.height = unet_config.sample_size * vae_scale_factor; + m_generation_config.width = unet_config.sample_size * vae_scale_factor; + } - if (class_name == "StableDiffusionPipeline") { + if (class_name == "StableDiffusionPipeline" || class_name == "StableDiffusionInpaintPipeline" || class_name == "StableDiffusionInpaintPipeline") { m_generation_config.guidance_scale = 7.5f; m_generation_config.num_inference_steps = 50; m_generation_config.strength = m_pipeline_type == PipelineType::IMAGE_2_IMAGE ? 0.8f : 1.0f; - } else if (class_name == "LatentConsistencyModelPipeline") { + } else if (class_name == "LatentConsistencyModelPipeline" || class_name == "LatentConsistencyModelImg2ImgPipeline") { m_generation_config.guidance_scale = 8.5f; m_generation_config.num_inference_steps = 4; m_generation_config.strength = m_pipeline_type == PipelineType::IMAGE_2_IMAGE ? 
0.8f : 1.0f; @@ -337,7 +461,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { assert(m_vae != nullptr); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) && - (width % vae_scale_factor == 0 || width < 0), "Both 'width' and 'height' must be divisible by", + (width % vae_scale_factor == 0 || width < 0), "Both 'width' and 'height' must be divisible by ", vae_scale_factor); } @@ -358,14 +482,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { OPENVINO_ASSERT(generation_config.negative_prompt_2 == std::nullopt, "Negative prompt 2 is not used by ", pipeline_name); OPENVINO_ASSERT(generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used by ", pipeline_name); - if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) { - ov::Shape initial_image_shape = initial_image.get_shape(); - size_t height = initial_image_shape[1], width = initial_image_shape[2]; - - OPENVINO_ASSERT(generation_config.height == height, - "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same"); - OPENVINO_ASSERT(generation_config.width == width, - "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same"); + if ((m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) && initial_image) { OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f, "'Strength' generation parameter must be withion [0, 1] range"); } else { @@ -380,6 +497,8 @@ class StableDiffusionPipeline : public DiffusionPipeline { std::shared_ptr m_clip_text_encoder = nullptr; std::shared_ptr m_unet = nullptr; std::shared_ptr m_vae = nullptr; + std::shared_ptr m_image_processor = nullptr, m_mask_processor = nullptr; + std::shared_ptr m_image_resizer = nullptr, m_mask_resizer = nullptr; }; } // namespace genai diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp index a7eb84bd3d..698b4b28da 100644 --- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp @@ -1,78 +1,40 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include +#pragma once -#include "image_generation/diffusion_pipeline.hpp" - -#include "openvino/genai/image_generation/autoencoder_kl.hpp" -#include "openvino/genai/image_generation/clip_text_model.hpp" +#include "image_generation/stable_diffusion_pipeline.hpp" #include "openvino/genai/image_generation/clip_text_model_with_projection.hpp" -#include "openvino/genai/image_generation/unet2d_condition_model.hpp" - -#include "json_utils.hpp" namespace ov { namespace genai { -class StableDiffusionXLPipeline : public DiffusionPipeline { +class StableDiffusionXLPipeline : public StableDiffusionPipeline { public: StableDiffusionXLPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir) : - DiffusionPipeline(pipeline_type) { + StableDiffusionPipeline(pipeline_type, root_dir) { const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); nlohmann::json data = nlohmann::json::parse(file); - using utils::read_json_param; - - set_scheduler(Scheduler::from_config(root_dir / 
"scheduler/scheduler_config.json")); - - const std::string text_encoder = data["text_encoder"][1].get(); - if (text_encoder == "CLIPTextModel") { - m_clip_text_encoder = std::make_shared(root_dir / "text_encoder"); - } else { - OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); - } const std::string text_encoder_2 = data["text_encoder_2"][1].get(); if (text_encoder_2 == "CLIPTextModelWithProjection") { m_clip_text_encoder_with_projection = std::make_shared(root_dir / "text_encoder_2"); } else { - OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); - } - - const std::string unet = data["unet"][1].get(); - if (unet == "UNet2DConditionModel") { - m_unet = std::make_shared(root_dir / "unet"); - } else { - OPENVINO_THROW("Unsupported '", unet, "' UNet type"); - } - - const std::string vae = data["vae"][1].get(); - if (vae == "AutoencoderKL") { - if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) - m_vae = std::make_shared(root_dir / "vae_decoder"); - else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { - m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder"); - } else { - OPENVINO_ASSERT("Unsupported pipeline type"); - } - } else { - OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + OPENVINO_THROW("Unsupported '", text_encoder_2, "' text encoder type"); } // initialize generation config initialize_generation_config(data["_class_name"].get()); // initialize force_zeros_for_empty_prompt, which is SDXL specific - read_json_param(data, "force_zeros_for_empty_prompt", m_force_zeros_for_empty_prompt); + utils::read_json_param(data, "force_zeros_for_empty_prompt", m_force_zeros_for_empty_prompt); } StableDiffusionXLPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) : - DiffusionPipeline(pipeline_type) { + StableDiffusionPipeline(pipeline_type) { const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); @@ -101,7 +63,7 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { properties_for_text_encoder(properties, "lora_te2") ); } else { - OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + OPENVINO_THROW("Unsupported '", text_encoder_2, "' text encoder type"); } const std::string unet = data["unet"][1].get(); @@ -121,9 +83,9 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) - m_vae = std::make_shared(root_dir / "vae_decoder", device, updated_roperties); - else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { - m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder", device, updated_roperties); + m_vae = std::make_shared(root_dir / "vae_decoder", device, properties); + else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { + m_vae = std::make_shared(root_dir / "vae_encoder", root_dir / "vae_decoder", device, properties); } else { OPENVINO_ASSERT("Unsupported pipeline type"); } @@ -146,11 +108,9 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { const CLIPTextModelWithProjection& clip_text_model_with_projection, const UNet2DConditionModel& unet, const AutoencoderKL& vae) - : DiffusionPipeline(pipeline_type), - m_clip_text_encoder(std::make_shared(clip_text_model)), - 
m_clip_text_encoder_with_projection(std::make_shared(clip_text_model_with_projection)), - m_unet(std::make_shared(unet)), - m_vae(std::make_shared(vae)) { + : StableDiffusionPipeline(pipeline_type, clip_text_model, unet, vae) { + m_clip_text_encoder_with_projection = std::make_shared(clip_text_model_with_projection); + // initialize generation config initialize_generation_config("StableDiffusionXLPipeline"); // here we implicitly imply that force_zeros_for_empty_prompt is set to True as by default in diffusers m_force_zeros_for_empty_prompt = true; @@ -194,14 +154,9 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { std::copy(time_ids.begin(), time_ids.end(), add_time_ids_data + time_ids.size()); } - std::string prompt_2_str = - generation_config.prompt_2 != std::nullopt ? *generation_config.prompt_2 : positive_prompt; - std::string negative_prompt_1_str = generation_config.negative_prompt != std::nullopt - ? *generation_config.negative_prompt - : std::string{}; - std::string negative_prompt_2_str = generation_config.negative_prompt_2 != std::nullopt - ? *generation_config.negative_prompt_2 - : negative_prompt_1_str; + std::string prompt_2_str = generation_config.prompt_2 != std::nullopt ? *generation_config.prompt_2 : positive_prompt; + std::string negative_prompt_1_str = generation_config.negative_prompt != std::nullopt ? *generation_config.negative_prompt : std::string{}; + std::string negative_prompt_2_str = generation_config.negative_prompt_2 != std::nullopt ? *generation_config.negative_prompt_2 : negative_prompt_1_str; // see https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L423-L427 bool force_zeros_for_empty_prompt = generation_config.negative_prompt == std::nullopt && m_force_zeros_for_empty_prompt; @@ -220,26 +175,7 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { ov::Tensor encoder_hidden_states_1 = m_clip_text_encoder->get_output_tensor(idx_hidden_state_1); ov::Tensor encoder_hidden_states_2 = m_clip_text_encoder_with_projection->get_output_tensor(idx_hidden_state_2); - ov::Shape ehs_1_shape = encoder_hidden_states_1.get_shape(); - ov::Shape ehs_2_shape = encoder_hidden_states_2.get_shape(); - - OPENVINO_ASSERT(ehs_1_shape[0] == ehs_2_shape[0] && ehs_1_shape[1] == ehs_2_shape[1], - "Tensors for concatenation must have the same dimensions"); - - // concatenate hidden_states from two encoders - ov::Shape encoder_hidden_states_shape = {ehs_1_shape[0], ehs_1_shape[1], ehs_1_shape[2] + ehs_2_shape[2]}; - encoder_hidden_states.set_shape(encoder_hidden_states_shape); - - const float* ehs_1_data = encoder_hidden_states_1.data(); - const float* ehs_2_data = encoder_hidden_states_2.data(); - float* encoder_hidden_states_data = encoder_hidden_states.data(); - - for (size_t i = 0; i < ehs_1_shape[0] * ehs_1_shape[1]; ++i, - encoder_hidden_states_data += encoder_hidden_states_shape[2], - ehs_1_data += ehs_1_shape[2], ehs_2_data += ehs_2_shape[2]) { - std::memcpy(encoder_hidden_states_data , ehs_1_data, ehs_1_shape[2] * sizeof(float)); - std::memcpy(encoder_hidden_states_data + ehs_1_shape[2], ehs_2_data, ehs_2_shape[2] * sizeof(float)); - } + encoder_hidden_states = numpy_utils::concat(encoder_hidden_states_1, encoder_hidden_states_2, -1); } else { ov::Tensor add_text_embeds_positive = m_clip_text_encoder_with_projection->infer(positive_prompt, negative_prompt_1_str, false); m_clip_text_encoder->infer(prompt_2_str, negative_prompt_2_str, false); @@ -342,142 +278,10 @@ class 
StableDiffusionXLPipeline : public DiffusionPipeline { } } - ov::Tensor prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { - const auto& unet_config = m_unet->get_config(); - const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - - ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels, - generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; - ov::Tensor latent; - - if (initial_image) { - latent = m_vae->encode(initial_image, generation_config.generator); - if (generation_config.num_images_per_prompt > 1) { - ov::Tensor batched_latent(ov::element::f32, latent_shape); - for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { - numpy_utils::batch_copy(latent, batched_latent, 0, n); - } - latent = batched_latent; - } - m_scheduler->add_noise(latent, generation_config.generator); - } else { - latent = generation_config.generator->randn_tensor(latent_shape); - - // latents are multiplied by 'init_noise_sigma' - float * latent_data = latent.data(); - for (size_t i = 0; i < latent.get_size(); ++i) - latent_data[i] *= m_scheduler->get_init_noise_sigma(); - } - - return latent; - } - - ov::Tensor generate(const std::string& positive_prompt, - ov::Tensor initial_image, - const ov::AnyMap& properties) override { - ImageGenerationConfig generation_config = m_generation_config; - generation_config.update_generation_config(properties); - - if (!initial_image) { - // in case of typical text to image generation, we need to ignore 'strength' - generation_config.strength = 1.0f; - } - - // Stable Diffusion pipeline - // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline - - const auto& unet_config = m_unet->get_config(); - const size_t batch_size_multiplier = m_unet->do_classifier_free_guidance(generation_config.guidance_scale) ? 
2 : 1; // Unet accepts 2x batch in case of CFG - const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - - if (generation_config.height < 0) - generation_config.height = unet_config.sample_size * vae_scale_factor; - if (generation_config.width < 0) - generation_config.width = unet_config.sample_size * vae_scale_factor; - check_inputs(generation_config, initial_image); - - m_clip_text_encoder->set_adapters(generation_config.adapters); - m_clip_text_encoder_with_projection->set_adapters(generation_config.adapters); - m_unet->set_adapters(generation_config.adapters); - - if (generation_config.generator == nullptr) { - uint32_t seed = time(NULL); - generation_config.generator = std::make_shared(seed); - } - - m_scheduler->set_timesteps(generation_config.num_inference_steps, generation_config.strength); - std::vector timesteps = m_scheduler->get_timesteps(); - - // compute text encoders and set hidden states - compute_hidden_states(positive_prompt, generation_config); - - // preparate initial latents - ov::Tensor latent = prepare_latents(initial_image, generation_config); - - // prepare latents passed to models taking into account guidance scale (batch size multipler) - ov::Shape latent_shape_cfg = latent.get_shape(); - latent_shape_cfg[0] *= batch_size_multiplier; - ov::Tensor latent_cfg(ov::element::f32, latent_shape_cfg); - - // use callback if defined - std::function callback; - auto callback_iter = properties.find(ov::genai::callback.name()); - bool do_callback = callback_iter != properties.end(); - if (do_callback) { - callback = callback_iter->second.as>(); - } - - ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {}); - for (size_t inference_step = 0; inference_step < timesteps.size(); inference_step++) { - numpy_utils::batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); - // concat the same latent twice along a batch dimension in case of CFG - if (batch_size_multiplier > 1) { - numpy_utils::batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt); - } - - m_scheduler->scale_model_input(latent_cfg, inference_step); - - ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); - ov::Tensor noise_pred_tensor = m_unet->infer(latent_cfg, timestep); - - ov::Shape noise_pred_shape = noise_pred_tensor.get_shape(); - noise_pred_shape[0] /= batch_size_multiplier; - - if (batch_size_multiplier > 1) { - noisy_residual_tensor.set_shape(noise_pred_shape); - - // perform guidance - float* noisy_residual = noisy_residual_tensor.data(); - const float* noise_pred_uncond = noise_pred_tensor.data(); - const float* noise_pred_text = noise_pred_uncond + noisy_residual_tensor.get_size(); - - for (size_t i = 0; i < noisy_residual_tensor.get_size(); ++i) { - noisy_residual[i] = noise_pred_uncond[i] + - generation_config.guidance_scale * (noise_pred_text[i] - noise_pred_uncond[i]); - } - } else { - noisy_residual_tensor = noise_pred_tensor; - } - - auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step, generation_config.generator); - latent = scheduler_step_result["latent"]; - - // check whether scheduler returns "denoised" image, which should be passed to VAE decoder - const auto it = scheduler_step_result.find("denoised"); - denoised = it != scheduler_step_result.end() ? 
it->second : latent; - - if (do_callback) { - if (callback(inference_step, denoised)) { - return ov::Tensor(ov::element::u8, {}); - } - } - } - - return decode(denoised); - } - - ov::Tensor decode(const ov::Tensor latent) override { - return m_vae->decode(latent); + void set_lora_adapters(std::optional adapters) override { + m_clip_text_encoder->set_adapters(adapters); + m_clip_text_encoder_with_projection->set_adapters(adapters); + m_unet->set_adapters(adapters); } private: @@ -490,23 +294,25 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { m_generation_config.height = unet_config.sample_size * vae_scale_factor; m_generation_config.width = unet_config.sample_size * vae_scale_factor; - if (class_name == "StableDiffusionXLPipeline") { - m_generation_config.guidance_scale = 5.0f; - m_generation_config.num_inference_steps = 50; - m_generation_config.strength = m_pipeline_type == PipelineType::IMAGE_2_IMAGE ? 0.3f : 1.0f; + if (class_name == "StableDiffusionXLPipeline" || class_name == "StableDiffusionXLImg2ImgPipeline" || class_name == "StableDiffusionXLInpaintPipeline") { + if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) { + m_generation_config.guidance_scale = 5.0f; + m_generation_config.num_inference_steps = 50; + m_generation_config.strength = 1.0f; + } else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { + m_generation_config.guidance_scale = 5.0f; + m_generation_config.num_inference_steps = 50; + m_generation_config.strength = 0.3f; + } else if (m_pipeline_type == PipelineType::INPAINTING) { + m_generation_config.guidance_scale = 7.5f; + m_generation_config.num_inference_steps = 50; + m_generation_config.strength = 0.9999f; + } } else { OPENVINO_THROW("Unsupported class_name '", class_name, "'. Please, contact OpenVINO GenAI developers"); } } - void check_image_size(const int height, const int width) const override { - assert(m_vae != nullptr); - const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); - OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) && - (width % vae_scale_factor == 0 || width < 0), "Both 'width' and 'height' must be divisible by", - vae_scale_factor); - } - void check_inputs(const ImageGenerationConfig& generation_config, ov::Tensor initial_image) const override { check_image_size(generation_config.width, generation_config.height); @@ -518,14 +324,7 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_2 == std::nullopt, "Negative prompt 2 is not used when guidance scale <= 1.0"); OPENVINO_ASSERT(generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used by ", pipeline_name); - if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) { - ov::Shape initial_image_shape = initial_image.get_shape(); - size_t height = initial_image_shape[1], width = initial_image_shape[2]; - - OPENVINO_ASSERT(generation_config.height == height, - "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same"); - OPENVINO_ASSERT(generation_config.width == width, - "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same"); + if ((m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) && initial_image) { OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f, "'Strength' generation parameter must be within [0, 1] range"); } else { @@ -547,10 +346,7 @@ class
StableDiffusionXLPipeline : public DiffusionPipeline { friend class Image2ImagePipeline; bool m_force_zeros_for_empty_prompt = true; - std::shared_ptr m_clip_text_encoder = nullptr; std::shared_ptr m_clip_text_encoder_with_projection = nullptr; - std::shared_ptr m_unet = nullptr; - std::shared_ptr m_vae = nullptr; }; } // namespace genai diff --git a/src/cpp/src/image_generation/text2image_pipeline.cpp b/src/cpp/src/image_generation/text2image_pipeline.cpp index 66dda88454..6ceb076f85 100644 --- a/src/cpp/src/image_generation/text2image_pipeline.cpp +++ b/src/cpp/src/image_generation/text2image_pipeline.cpp @@ -111,6 +111,20 @@ Text2ImagePipeline Text2ImagePipeline::stable_diffusion_3( return Text2ImagePipeline(impl); } +Text2ImagePipeline Text2ImagePipeline::stable_diffusion_3( + const std::shared_ptr& scheduler, + const CLIPTextModelWithProjection& clip_text_model_1, + const CLIPTextModelWithProjection& clip_text_model_2, + const SD3Transformer2DModel& transformer, + const AutoencoderKL& vae){ + auto impl = std::make_shared(PipelineType::TEXT_2_IMAGE, clip_text_model_1, clip_text_model_2, transformer, vae); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return Text2ImagePipeline(impl); +} + Text2ImagePipeline Text2ImagePipeline::flux( const std::shared_ptr& scheduler, const CLIPTextModel& clip_text_model, @@ -144,7 +158,7 @@ void Text2ImagePipeline::compile(const std::string& device, const ov::AnyMap& pr } ov::Tensor Text2ImagePipeline::generate(const std::string& positive_prompt, const ov::AnyMap& properties) { - return m_impl->generate(positive_prompt, {}, properties); + return m_impl->generate(positive_prompt, {}, {}, properties); } ov::Tensor Text2ImagePipeline::decode(const ov::Tensor latent) { diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index c041e349ed..fe13e5848f 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -157,16 +157,20 @@ The pipeline can work with other similar topologies produced by `optimum-intel` > [!NOTE] > Models should belong to the same family and have the same tokenizers. -## Text 2 image models +## Image generation models + + + + + + + + + + + +
+<table>
+  <tr>
+    <th>Architecture</th>
+    <th>Text 2 image</th>
+    <th>Image 2 image</th>
+    <th>Example HuggingFace Models</th>
+  </tr>
+  <tr>
+    <td>Latent Consistency Model</td>
+    <td>Supported</td>
+    <td>Supported</td>
+    <td></td>
+  </tr>
+  <tr>
+    <td>Stable Diffusion</td>
+    <td>Supported</td>
+    <td>Supported</td>
+    <td></td>
+  </tr>
+  <tr>
+    <td>Stable Diffusion XL</td>
+    <td>Supported</td>
+    <td>Supported</td>
+    <td></td>
+  </tr>
+  <tr>
+    <td>Stable Diffusion 3</td>
+    <td>Supported</td>
+    <td>Not supported</td>
+    <td></td>
+  </tr>
+  <tr>
+    <td>Flux</td>
+    <td>Supported</td>
+    <td>Not supported</td>
+    <td></td>
+  </tr>
+</table>
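For orientation, below is a minimal sketch of driving one of the image-to-image capable models from the table above through the new `Image2ImagePipeline` Python API. The model folder, input image path, and the `[1, H, W, 3]` uint8 tensor layout are illustrative assumptions modelled on the samples added by this PR, not normative documentation.

```python
# Hedged sketch: image-to-image generation with the new Python API.
# The model directory and image file below are hypothetical placeholders.
import numpy as np
import openvino as ov
import openvino_genai
from PIL import Image

def read_image(path: str) -> ov.Tensor:
    """Load an RGB image as a batched uint8 tensor (assumed [1, H, W, 3] layout)."""
    pic = Image.open(path).convert("RGB")
    return ov.Tensor(np.array(pic)[None])

pipe = openvino_genai.Image2ImagePipeline("./stable-diffusion-v1-5-ov", "CPU")  # hypothetical export folder
source = read_image("cat.png")                                                  # hypothetical input image

# strength < 1.0 preserves part of the source image; 1.0 re-noises it completely
image_tensor = pipe.generate("cat wizard, highly detailed", source, strength=0.8)
Image.fromarray(image_tensor.data[0]).save("image2image.bmp")
```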
+## Inpainting models
+
+In addition to the image generation models above, `InpaintingPipeline` supports specialized inpainting models:
+
+<table>
+  <tr>
+    <th>Architecture</th>
+    <th>Example HuggingFace Models</th>
+  </tr>
+  <tr>
+    <td>Stable Diffusion</td>
+    <td></td>
+  </tr>
+  <tr>
+    <td>Stable Diffusion XL</td>
+    <td></td>
+  </tr>
+</table>
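Similarly, a hedged sketch of the `InpaintingPipeline` flow for the models listed above. The paths are placeholders, and the white-means-repaint mask convention is assumed from the diffusers-style pipelines this PR mirrors.

```python
# Hedged sketch: inpainting with an image plus a mask (paths are hypothetical).
import numpy as np
import openvino as ov
import openvino_genai
from PIL import Image

def read_image(path: str) -> ov.Tensor:
    """Load an RGB image as a batched uint8 tensor (assumed [1, H, W, 3] layout)."""
    pic = Image.open(path).convert("RGB")
    return ov.Tensor(np.array(pic)[None])

pipe = openvino_genai.InpaintingPipeline("./stable-diffusion-2-inpainting-ov", "CPU")  # hypothetical export folder
image = read_image("image.png")      # original picture
mask = read_image("mask_image.png")  # assumed convention: white pixels are regenerated, black pixels are kept

image_tensor = pipe.generate("Face of a yellow cat, high resolution, sitting on a park bench", image, mask)
Image.fromarray(image_tensor.data[0]).save("inpainting.bmp")
```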
+ ## Visual language models diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index a97812261f..ca7c2c0b32 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -69,6 +69,8 @@ SD3Transformer2DModel, AutoencoderKL, Text2ImagePipeline, + Image2ImagePipeline, + InpaintingPipeline, Scheduler, ImageGenerationConfig, Generator, diff --git a/src/python/openvino_genai/__init__.pyi b/src/python/openvino_genai/__init__.pyi index e7e4e2619c..4d74e17588 100644 --- a/src/python/openvino_genai/__init__.pyi +++ b/src/python/openvino_genai/__init__.pyi @@ -19,7 +19,9 @@ from openvino_genai.py_openvino_genai import FluxTransformer2DModel from openvino_genai.py_openvino_genai import GenerationConfig from openvino_genai.py_openvino_genai import GenerationResult from openvino_genai.py_openvino_genai import Generator +from openvino_genai.py_openvino_genai import Image2ImagePipeline from openvino_genai.py_openvino_genai import ImageGenerationConfig +from openvino_genai.py_openvino_genai import InpaintingPipeline from openvino_genai.py_openvino_genai import LLMPipeline from openvino_genai.py_openvino_genai import PerfMetrics from openvino_genai.py_openvino_genai import RawPerfMetrics @@ -41,5 +43,5 @@ from openvino_genai.py_openvino_genai import WhisperRawPerfMetrics from openvino_genai.py_openvino_genai import draft_model import os as os from . import py_openvino_genai -__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'ImageGenerationConfig', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'openvino', 'os', 'py_openvino_genai'] +__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'openvino', 'os', 'py_openvino_genai'] __version__: str = '2025.0.0.0' diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 24bf6fd785..8ab0407ea7 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -5,7 +5,7 @@ from __future__ import annotations import openvino._pyopenvino import os import typing -__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 
'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'ImageGenerationConfig', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model'] +__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model'] class Adapter: """ Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier. @@ -745,6 +745,78 @@ class Generator: """ def __init__(self) -> None: ... +class Image2ImagePipeline: + """ + This class is used for generation with image-to-image models. + """ + @staticmethod + def latent_consistency_model(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Image2ImagePipeline: + ... + @staticmethod + def stable_diffusion(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Image2ImagePipeline: + ... + @staticmethod + def stable_diffusion_xl(scheduler: Scheduler, clip_text_model: CLIPTextModel, clip_text_model_with_projection: CLIPTextModelWithProjection, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Image2ImagePipeline: + ... + @typing.overload + def __init__(self, models_path: os.PathLike) -> None: + """ + Image2ImagePipeline class constructor. + models_path (os.PathLike): Path to the folder with exported model files. + """ + @typing.overload + def __init__(self, models_path: os.PathLike, device: str, **kwargs) -> None: + """ + Image2ImagePipeline class constructor. + models_path (os.PathLike): Path with exported model files. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Image2ImagePipeline properties + """ + def compile(self, device: str, **kwargs) -> None: + """ + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + """ + def decode(self, latent: openvino._pyopenvino.Tensor) -> openvino._pyopenvino.Tensor: + ... 
+ def generate(self, prompt: str, image: openvino._pyopenvino.Tensor, **kwargs) -> openvino._pyopenvino.Tensor: + """ + Generates images for image-to-image models. + + :param prompt: input prompt + :type prompt: str + + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. + + Expected parameters list: + prompt_2: str - second prompt, + prompt_3: str - third prompt, + negative_prompt: str - negative prompt, + negative_prompt_2: str - second negative prompt, + negative_prompt_3: str - third negative prompt, + num_images_per_prompt: int - number of images, that should be generated per prompt, + guidance_scale: float - guidance scale, + generation_config: GenerationConfig, + height: int - height of resulting images, + width: int - width of resulting images, + num_inference_steps: int - number of inference steps, + generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator, + adapters: LoRA adapters, + strength: strength for image to image generation. 1.0f means initial image is fully noised, + max_sequence_length: int - length of t5_encoder_model input + + :return: ov.Tensor with resulting images + :rtype: ov.Tensor + """ + def get_generation_config(self) -> ImageGenerationConfig: + ... + def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None: + ... + def set_generation_config(self, generation_config: ImageGenerationConfig) -> None: + ... + def set_scheduler(self, scheduler: Scheduler) -> None: + ... class ImageGenerationConfig: """ This class is used for storing generation config for image generation pipeline. @@ -769,6 +841,78 @@ class ImageGenerationConfig: ... def validate(self) -> None: ... +class InpaintingPipeline: + """ + This class is used for generation with inpainting models. + """ + @staticmethod + def latent_consistency_model(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> InpaintingPipeline: + ... + @staticmethod + def stable_diffusion(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> InpaintingPipeline: + ... + @staticmethod + def stable_diffusion_xl(scheduler: Scheduler, clip_text_model: CLIPTextModel, clip_text_model_with_projection: CLIPTextModelWithProjection, unet: UNet2DConditionModel, vae: AutoencoderKL) -> InpaintingPipeline: + ... + @typing.overload + def __init__(self, models_path: os.PathLike) -> None: + """ + InpaintingPipeline class constructor. + models_path (os.PathLike): Path to the folder with exported model files. + """ + @typing.overload + def __init__(self, models_path: os.PathLike, device: str, **kwargs) -> None: + """ + InpaintingPipeline class constructor. + models_path (os.PathLike): Path with exported model files. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: InpaintingPipeline properties + """ + def compile(self, device: str, **kwargs) -> None: + """ + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + """ + def decode(self, latent: openvino._pyopenvino.Tensor) -> openvino._pyopenvino.Tensor: + ... + def generate(self, prompt: str, image: openvino._pyopenvino.Tensor, mask_image: openvino._pyopenvino.Tensor, **kwargs) -> openvino._pyopenvino.Tensor: + """ + Generates images for inpainting models.
+ + :param prompt: input prompt + :type prompt: str + + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. + + Expected parameters list: + prompt_2: str - second prompt, + prompt_3: str - third prompt, + negative_prompt: str - negative prompt, + negative_prompt_2: str - second negative prompt, + negative_prompt_3: str - third negative prompt, + num_images_per_prompt: int - number of images, that should be generated per prompt, + guidance_scale: float - guidance scale, + generation_config: GenerationConfig, + height: int - height of resulting images, + width: int - width of resulting images, + num_inference_steps: int - number of inference steps, + generator: openvino_genai.CppStdGenerator or class inherited from openvino_genai.Generator - random generator, + adapters: LoRA adapters, + strength: strength for image to image generation. 1.0f means initial image is fully noised, + max_sequence_length: int - length of t5_encoder_model input + + :return: ov.Tensor with resulting images + :rtype: ov.Tensor + """ + def get_generation_config(self) -> ImageGenerationConfig: + ... + def reshape(self, num_images_per_prompt: int, height: int, width: int, guidance_scale: float) -> None: + ... + def set_generation_config(self, generation_config: ImageGenerationConfig) -> None: + ... + def set_scheduler(self, scheduler: Scheduler) -> None: + ... class LLMPipeline: """ This class is used for generation with LLMs @@ -1367,9 +1511,14 @@ class Text2ImagePipeline: def stable_diffusion(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Text2ImagePipeline: ... @staticmethod + @typing.overload def stable_diffusion_3(scheduler: Scheduler, clip_text_model_1: CLIPTextModelWithProjection, clip_text_model_2: CLIPTextModelWithProjection, t5_encoder_model: T5EncoderModel, transformer: SD3Transformer2DModel, vae: AutoencoderKL) -> Text2ImagePipeline: ... @staticmethod + @typing.overload + def stable_diffusion_3(scheduler: Scheduler, clip_text_model_1: CLIPTextModelWithProjection, clip_text_model_2: CLIPTextModelWithProjection, transformer: SD3Transformer2DModel, vae: AutoencoderKL) -> Text2ImagePipeline: + ... + @staticmethod def stable_diffusion_xl(scheduler: Scheduler, clip_text_model: CLIPTextModel, clip_text_model_with_projection: CLIPTextModelWithProjection, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Text2ImagePipeline: ... 
@typing.overload diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index 64ea64feb0..7739b88ff9 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -10,6 +10,8 @@ #include #include "openvino/genai/image_generation/text2image_pipeline.hpp" +#include "openvino/genai/image_generation/image2image_pipeline.hpp" +#include "openvino/genai/image_generation/inpainting_pipeline.hpp" #include "tokenizers_path.hpp" #include "py_utils.hpp" @@ -173,7 +175,12 @@ void init_image_generation_pipelines(py::module_& m) { .def_static("stable_diffusion", &ov::genai::Text2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) .def_static("latent_consistency_model", &ov::genai::Text2ImagePipeline::latent_consistency_model, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) .def_static("stable_diffusion_xl", &ov::genai::Text2ImagePipeline::stable_diffusion_xl, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("clip_text_model_with_projection"), py::arg("unet"), py::arg("vae")) - .def_static("stable_diffusion_3", &ov::genai::Text2ImagePipeline::stable_diffusion_3, py::arg("scheduler"), py::arg("clip_text_model_1"), py::arg("clip_text_model_2"), py::arg("t5_encoder_model"), py::arg("transformer"), py::arg("vae")) + .def_static("stable_diffusion_3", py::overload_cast&, const ov::genai::CLIPTextModelWithProjection&, const ov::genai::CLIPTextModelWithProjection&, const ov::genai::T5EncoderModel&, + const ov::genai::SD3Transformer2DModel&, const ov::genai::AutoencoderKL&>(&ov::genai::Text2ImagePipeline::stable_diffusion_3), + py::arg("scheduler"), py::arg("clip_text_model_1"), py::arg("clip_text_model_2"), py::arg("t5_encoder_model"), py::arg("transformer"), py::arg("vae")) + .def_static("stable_diffusion_3", py::overload_cast&, const ov::genai::CLIPTextModelWithProjection&, const ov::genai::CLIPTextModelWithProjection&, + const ov::genai::SD3Transformer2DModel&, const ov::genai::AutoencoderKL&>(&ov::genai::Text2ImagePipeline::stable_diffusion_3), + py::arg("scheduler"), py::arg("clip_text_model_1"), py::arg("clip_text_model_2"), py::arg("transformer"), py::arg("vae")) .def_static("flux", &ov::genai::Text2ImagePipeline::flux, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("t5_encoder_model"), py::arg("transformer"), py::arg("vae")) .def( "compile", @@ -200,5 +207,139 @@ void init_image_generation_pipelines(py::module_& m) { }, py::arg("prompt"), "Input string", (text2image_generate_docstring + std::string(" \n ")).c_str()) - .def("decode", &ov::genai::Text2ImagePipeline::decode, py::arg("latent"));; + .def("decode", &ov::genai::Text2ImagePipeline::decode, py::arg("latent")); + + + auto image2image_pipeline = py::class_(m, "Image2ImagePipeline", "This class is used for generation with image-to-image models.") + .def(py::init([]( + const std::filesystem::path& models_path + ) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); + return std::make_unique(models_path); + }), + py::arg("models_path"), "folder with exported model files.", + R"( + Image2ImagePipeline class constructor. + models_path (os.PathLike): Path to the folder with exported model files. 
+ )") + + .def(py::init([]( + const std::filesystem::path& models_path, + const std::string& device, + const py::kwargs& kwargs + ) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); + return std::make_unique(models_path, device, pyutils::kwargs_to_any_map(kwargs)); + }), + py::arg("models_path"), "folder with exported model files.", + py::arg("device"), "device on which inference will be done", + R"( + Image2ImagePipeline class constructor. + models_path (os.PathLike): Path with exported model files. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Image2ImagePipeline properties + )") + .def("get_generation_config", &ov::genai::Image2ImagePipeline::get_generation_config) + .def("set_generation_config", &ov::genai::Image2ImagePipeline::set_generation_config, py::arg("generation_config")) + .def("set_scheduler", &ov::genai::Image2ImagePipeline::set_scheduler, py::arg("scheduler")) + .def("reshape", &ov::genai::Image2ImagePipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale")) + .def_static("stable_diffusion", &ov::genai::Image2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) + .def_static("latent_consistency_model", &ov::genai::Image2ImagePipeline::latent_consistency_model, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) + .def_static("stable_diffusion_xl", &ov::genai::Image2ImagePipeline::stable_diffusion_xl, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("clip_text_model_with_projection"), py::arg("unet"), py::arg("vae")) + .def( + "compile", + [](ov::genai::Image2ImagePipeline& pipe, + const std::string& device, + const py::kwargs& kwargs + ) { + pipe.compile(device, pyutils::kwargs_to_any_map(kwargs)); + }, + py::arg("device"), "device on which inference will be done", + R"( + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + )") + .def( + "generate", + [](ov::genai::Image2ImagePipeline& pipe, + const std::string& prompt, + const ov::Tensor& image, + const py::kwargs& kwargs + ) -> py::typing::Union { + ov::AnyMap params = pyutils::kwargs_to_any_map(kwargs); + return py::cast(pipe.generate(prompt, image, params)); + }, + py::arg("prompt"), "Input string", + py::arg("image"), "Initial image", + (text2image_generate_docstring + std::string(" \n ")).c_str()) + .def("decode", &ov::genai::Image2ImagePipeline::decode, py::arg("latent")); + + + auto inpainting_pipeline = py::class_(m, "InpaintingPipeline", "This class is used for generation with inpainting models.") + .def(py::init([]( + const std::filesystem::path& models_path + ) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); + return std::make_unique(models_path); + }), + py::arg("models_path"), "folder with exported model files.", + R"( + InpaintingPipeline class constructor. + models_path (os.PathLike): Path to the folder with exported model files. + )") + + .def(py::init([]( + const std::filesystem::path& models_path, + const std::string& device, + const py::kwargs& kwargs + ) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); + return std::make_unique(models_path, device, pyutils::kwargs_to_any_map(kwargs)); + }), + py::arg("models_path"), "folder with exported model files.", + py::arg("device"), "device on which inference will be done", + R"( + InpaintingPipeline class constructor. 
+ models_path (os.PathLike): Path with exported model files. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: InpaintingPipeline properties + )") + .def("get_generation_config", &ov::genai::InpaintingPipeline::get_generation_config) + .def("set_generation_config", &ov::genai::InpaintingPipeline::set_generation_config, py::arg("generation_config")) + .def("set_scheduler", &ov::genai::InpaintingPipeline::set_scheduler, py::arg("scheduler")) + .def("reshape", &ov::genai::InpaintingPipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale")) + .def_static("stable_diffusion", &ov::genai::InpaintingPipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) + .def_static("latent_consistency_model", &ov::genai::InpaintingPipeline::latent_consistency_model, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) + .def_static("stable_diffusion_xl", &ov::genai::InpaintingPipeline::stable_diffusion_xl, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("clip_text_model_with_projection"), py::arg("unet"), py::arg("vae")) + .def( + "compile", + [](ov::genai::InpaintingPipeline& pipe, + const std::string& device, + const py::kwargs& kwargs + ) { + pipe.compile(device, pyutils::kwargs_to_any_map(kwargs)); + }, + py::arg("device"), "device on which inference will be done", + R"( + Compiles the model. + device (str): Device to run the model on (e.g., CPU, GPU). + kwargs: Device properties. + )") + .def( + "generate", + [](ov::genai::InpaintingPipeline& pipe, + const std::string& prompt, + const ov::Tensor& image, + const ov::Tensor& mask_image, + const py::kwargs& kwargs + ) -> py::typing::Union { + ov::AnyMap params = pyutils::kwargs_to_any_map(kwargs); + return py::cast(pipe.generate(prompt, image, mask_image, params)); + }, + py::arg("prompt"), "Input string", + py::arg("image"), "Initial image", + py::arg("mask_image"), "Mask image", + (text2image_generate_docstring + std::string(" \n ")).c_str()) + .def("decode", &ov::genai::InpaintingPipeline::decode, py::arg("latent")); }
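Since the bindings above forward arbitrary Python keyword arguments through `kwargs_to_any_map` into pipeline properties, the following hedged sketch shows how generation parameters and the reshape-then-compile flow look from Python. The model folder, image path, and static sizes are illustrative assumptions only.

```python
# Hedged sketch: generation parameters travel as Python kwargs and become
# ov::AnyMap properties; reshape() fixes static shapes before compile().
import numpy as np
import openvino as ov
import openvino_genai
from PIL import Image

pipe = openvino_genai.Image2ImagePipeline("./stable-diffusion-v1-5-ov")  # hypothetical folder; not compiled yet

guidance_scale = 7.5
# Fix static shapes first, then compile for the target device.
pipe.reshape(num_images_per_prompt=1, height=512, width=512, guidance_scale=guidance_scale)
pipe.compile("CPU")

pic = Image.open("input_512x512.png").convert("RGB")  # hypothetical 512x512 source image
image = ov.Tensor(np.array(pic)[None])                # assumed [1, H, W, 3] uint8 layout

image_tensor = pipe.generate(
    "photo of a castle at sunset",
    image,
    guidance_scale=guidance_scale,
    strength=0.7,               # how strongly the initial image is noised
    num_inference_steps=30,
)
Image.fromarray(image_tensor.data[0]).save("result.bmp")
```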