diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index c18d62fc59..81b868028c 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -14,6 +14,7 @@ concurrency: env: l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu20_2024.5.0.dev20240830_x86_64.tgz + l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu22_2024.5.0.dev20240830_x86_64.tgz m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/m_openvino_toolkit_macos_12_6_2024.5.0.dev20240830_x86_64.tgz w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/w_openvino_toolkit_windows_2024.5.0.dev20240830_x86_64.zip jobs: @@ -202,8 +203,7 @@ jobs: echo "Multi prompt" passed cpp-greedy_causal_lm-windows: - runs-on: windows-latest - if: ${{ false }} # TODO: fix Windows + runs-on: windows-2019-16-core env: PYTHONIOENCODING: "utf8" defaults: @@ -218,6 +218,8 @@ jobs: python-version: 3.9 - name: Configure Developer Command Prompt for Microsoft Visual C++ uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + with: + toolset: 14.29 - run: curl --output ov.zip ${{ env.w_ov_link }} - run: unzip -d ov ov.zip - run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" @@ -681,7 +683,7 @@ jobs: diff pred2.txt ref.txt echo "Chat sample python" passed - py-vlm_chat_sample-ubuntu: + visual_chat_sample-ubuntu: runs-on: ubuntu-22.04-16-cores steps: - uses: actions/checkout@v4 @@ -693,7 +695,7 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_u22_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Build app run: | @@ -707,10 +709,19 @@ jobs: python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/ wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 - - run: | + + - name: Run chat chat sample + run: > source ./ov/setupvars.sh - timeout 2m ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11 + && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11 <<< $'What is on the image?\nWhat is special on the image?' + - name: Run Python chat sample + run: | + source ./ov/setupvars.sh + export PYTHONPATH=./build/:$PYTHONPATH + printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt + timeout 120s python ./samples/python/vlm_chat_sample/vlm_chat_sample.py ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11 < input.txt > ./pred.txt || ( [[ $? -eq 124 ]] && \ + echo "Timeout reached, but it's excpected." 
) cpp-continuous-batching-ubuntu: runs-on: ubuntu-20.04-8-cores @@ -756,8 +767,7 @@ jobs: timeout 200s ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 10 --dynamic_split_fuse --max_batch_size 256 --max_input_len 256 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1 cpp-continuous-batching-windows: - runs-on: windows-latest - if: ${{ false }} # TODO: fix Windows + runs-on: windows-2019-16-core env: PYTHONIOENCODING: "utf8" defaults: @@ -772,6 +782,8 @@ jobs: python-version: 3.9 - name: Configure Developer Command Prompt for Microsoft Visual C++ uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + with: + toolset: 14.29 - name: Install OpenVINO run: | curl --output ov.zip ${{ env.w_ov_link }} @@ -854,6 +866,7 @@ jobs: cpp-beam_search_causal_lm-Qwen-7B-Chat, cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2, cpp-beam_search_causal_lm-notus-7b-v1, cpp-speculative_decoding_lm-ubuntu, cpp-prompt_lookup_decoding_lm-ubuntu, cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu, + visual_language_sample-ubuntu, cpp-continuous-batching-windows, cpp-continuous-batching-macos] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index cd31ae497d..a5c057a0ab 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -67,11 +67,10 @@ jobs: - name: Run app run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - ./build/samples/cpp/stable_diffusion/stable_diffusion ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + ./build/samples/cpp/text2image/stable_diffusion ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" lcm_dreamshaper_v7_cpp-windows: - runs-on: windows-latest - if: ${{ false }} # TODO: fix Windows + runs-on: windows-2019-16-core defaults: run: shell: pwsh @@ -89,6 +88,11 @@ jobs: mv ./tmp/*/* . popd + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + with: + toolset: 14.29 + - name: Build app run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" @@ -116,9 +120,9 @@ jobs: optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --task stable-diffusion --weight-format fp16 models/lcm_dreamshaper_v7/FP16 - name: Run app - run: | + run: > . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - ./build/samples/cpp/stable_diffusion/Release/lcm_dreamshaper.exe ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + & "./build/samples/cpp/text2image/Release/stable_diffusion.exe ./models/lcm_dreamshaper_v7/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" Overall_Status: name: ci/gha_overall_status_lcm diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 45e6dc2941..be309c732d 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -40,11 +40,7 @@ jobs: python -m pip install --upgrade pip python -m pip install flake8 pytest black GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt - python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url -https://storage.openvinotoolkit.org/simple/wheels/nightly - GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt - GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }} - + python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -74,6 +70,9 @@ https://storage.openvinotoolkit.org/simple/wheels/nightly python ./llm_bench/python/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./llm_bench/python/prompts/stable-diffusion.jsonl -d cpu -n 1 - name: WWB Tests run: | + GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt + pip install git+https://github.com/huggingface/optimum.git + GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }} python -m pytest llm_bench/python/who_what_benchmark/tests stateful: runs-on: ubuntu-20.04 @@ -86,13 +85,13 @@ https://storage.openvinotoolkit.org/simple/wheels/nightly run: | GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r llm_bench/python/requirements.txt python -m pip uninstall --yes openvino - python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url -https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir . 
--stateful grep beam_idx pytorch/dldt/FP32/openvino_model.xml - name: WWB Tests run: | GIT_CLONE_PROTECTION_ACTIVE=false pip install -r llm_bench/python/who_what_benchmark/requirements.txt + pip install git+https://github.com/huggingface/optimum.git GIT_CLONE_PROTECTION_ACTIVE=false pip install llm_bench/python/who_what_benchmark/ pip install pytest python -m pytest llm_bench/python/who_what_benchmark/tests diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index c69287678d..ae6b7ce57b 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -63,15 +63,20 @@ jobs: run: | source openvino_sd_cpp/bin/activate optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --weight-format fp16 --task stable-diffusion models/dreamlike-art-dreamlike-anime-1.0/FP16 + wget -O ./models/soulcard.safetensors https://civitai.com/api/download/models/72591 - - name: Run app + - name: Run main app run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - ./build/samples/cpp/stable_diffusion/stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + ./build/samples/cpp/text2image/stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + + - name: Run LoRA app + run: | + source ${{ env.OV_INSTALL_DIR }}/setupvars.sh + ./build/samples/cpp/text2image/lora_stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "curly-haired unicorn in the forest, anime, line" ./models/soulcard.safetensors 0.7 stable_diffusion_1_5_cpp-windows: - runs-on: windows-latest - if: ${{ false }} # TODO: fix Windows + runs-on: windows-2019-16-core defaults: run: shell: pwsh @@ -89,6 +94,11 @@ jobs: mv ./tmp/*/* . popd + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + with: + toolset: 14.29 + - name: Build app run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" @@ -114,11 +124,17 @@ jobs: run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike-art-dreamlike-anime-1.0/FP16 + Invoke-WebRequest -Uri 'https://civitai.com/api/download/models/72591' -OutFile 'models/soulcard.safetensors' - - name: Run app - run: | + - name: Run main app + run: > + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + & "./build/samples/cpp/text2image/Release/stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" + + - name: Run LoRA app + run: > . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - ./build/samples/cpp/stable_diffusion/Release/stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + & "./build/samples/cpp/text2image/Release/lora_stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'curly-haired unicorn in the forest, anime, line' ./models/soulcard.safetensors 0.7" Overall_Status: name: ci/gha_overall_status_stable_diffusion diff --git a/README.md b/README.md index f20ae1c268..6a7c325f69 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ It includes the following pipelines: 6. [multinomial_causal_lm](./samples/cpp/multinomial_causal_lm/README.md) 7. [prompt_lookup_decoding_lm](./samples/cpp/prompt_lookup_decoding_lm/README.md) 8. [speculative_decoding_lm](./samples/cpp/speculative_decoding_lm/README.md) -3. [Stable Diffuison and Latent Consistency Model (with LoRA) C++ image generation pipeline](./samples/cpp/stable_diffusion/README.md) +3. [Stable Diffuison and Latent Consistency Model (with LoRA) C++ image generation pipeline](./samples/cpp/text2image/README.md) ### Requirements diff --git a/llm_bench/python/README.md b/llm_bench/python/README.md index b49ad980ab..3ef58f113a 100755 --- a/llm_bench/python/README.md +++ b/llm_bench/python/README.md @@ -1,140 +1,165 @@ -# Benchmarking script for large language models +# Benchmarking Script for Large Language Models -This script provides a unified approach to estimate performance for Large Language Models. -It is based on pipelines provided by Optimum-Intel and allows to estimate performance for -pytorch and openvino models, using almost the same code and precollected models. +This script provides a unified approach to estimate performance for Large Language Models (LLMs). It leverages pipelines provided by Optimum-Intel and allows performance estimation for PyTorch and OpenVINO models using nearly identical code and pre-collected models. -## Usage -### 1. Start a Python virtual environment +### 1. Prepare Python Virtual Environment for LLM Benchmarking ``` bash -python3 -m venv python-env -source python-env/bin/activate +python3 -m venv ov-llm-bench-env +source ov-llm-bench-env/bin/activate pip install --upgrade pip -pip install -r requirements.txt + +git clone https://github.com/openvinotoolkit/openvino.genai.git +cd openvino.genai/llm_bench/python/ +pip install -r requirements.txt ``` -> Note: -> If you are using an existing python environment, recommend following command to use all the dependencies with latest versions: -> pip install -U --upgrade-strategy eager -r requirements.txt -### 2. Convert a model to OpenVINO IR - -The optimum-cli tool allows you to convert models from Hugging Face to the OpenVINO IR format. 
More detailed info about tool usage can be found in [Optimum Intel documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/export) +> Note: +> For existing Python environments, run the following command to ensure that all dependencies are installed with the latest versions: +> `pip install -U --upgrade-strategy eager -r requirements.txt` -Prerequisites: -install conversion dependencies using `requirements.txt` +#### (Optional) Hugging Face Login : -Usage: +Login to Hugging Face if you want to use non-public models: ```bash -optimum-cli export openvino --model --weight-format +huggingface-cli login ``` -Paramters: -* `--model ` - model_id for downloading from huggngface_hub (https://huggingface.co/models) or path with directory where pytorch model located. -* `--weight-format` - precision for model conversion fp32, fp16, int8, int4 -* `` - output directory for saving OpenVINO model. +### 2. Convert Model to OpenVINO IR Format + +The `optimum-cli` tool simplifies converting Hugging Face models to OpenVINO IR format. +- Detailed documentation can be found in the [Optimum-Intel documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/export). +- To learn more about weight compression, see the [NNCF Weight Compression Guide](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html). +- For additional guidance on running inference with OpenVINO for LLMs, see the [OpenVINO LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html). -Usage example: -```bash -optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format fp16 models/llama-2-7b-chat -``` +**Usage:** -the result of running the command will have the following file structure: +```bash +optimum-cli export openvino --model --weight-format - |-llama-2-7b-chat - |-pytorch - |-dldt - |-FP16 - |-openvino_model.xml - |-openvino_model.bin - |-config.json - |-generation_config.json - |-tokenizer_config.json - |-tokenizer.json - |-tokenizer.model - |-special_tokens_map.json +optimum-cli export openvino -h # For detailed information +``` -### 3. Benchmarking +* `--model ` : model_id for downloading from [huggngface_hub](https://huggingface.co/models) or path with directory where pytorch model located. +* `--weight-format ` : precision for model conversion. Available options: `fp32, fp16, int8, int4, mxfp4` +* ``: output directory for saving generated OpenVINO model. -Prerequisites: -install benchmarking dependencies using `requirements.txt` +**NOTE:** +- Models larger than 1 billion parameters are exported to the OpenVINO format with 8-bit weights by default. You can disable it with `--weight-format fp32`. -``` bash -pip install -r requirements.txt +**Example:** +```bash +optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format fp16 models/llama-2-7b-chat ``` -note: **You can specify the installed OpenVINO version through pip install** -``` bash -# e.g. -pip install openvino==2023.3.0 +**Resulting file structure:** + +```console + models + └── llama-2-7b-chat + ├── config.json + ├── generation_config.json + ├── openvino_detokenizer.bin + ├── openvino_detokenizer.xml + ├── openvino_model.bin + ├── openvino_model.xml + ├── openvino_tokenizer.bin + ├── openvino_tokenizer.xml + ├── special_tokens_map.json + ├── tokenizer_config.json + ├── tokenizer.json + └── tokenizer.model ``` -### 4. Run the following command to test the performance of one LLM model +### 3. 
Benchmark LLM Model + +To benchmark the performance of the LLM, use the following command: + ``` bash python benchmark.py -m -d -r -f -p -n # e.g. -python benchmark.py -m models/llama-2-7b-chat/pytorch/dldt/FP32 -n 2 -python benchmark.py -m models/llama-2-7b-chat/pytorch/dldt/FP32 -p "What is openvino?" -n 2 -python benchmark.py -m models/llama-2-7b-chat/pytorch/dldt/FP32 -pf prompts/llama-2-7b-chat_l.jsonl -n 2 +python benchmark.py -m models/llama-2-7b-chat/ -n 2 +python benchmark.py -m models/llama-2-7b-chat/ -p "What is openvino?" -n 2 +python benchmark.py -m models/llama-2-7b-chat/ -pf prompts/llama-2-7b-chat_l.jsonl -n 2 ``` -Parameters: -* `-m` - model path -* `-d` - inference device (default=cpu) -* `-r` - report csv -* `-f` - framework (default=ov) -* `-p` - interactive prompt text -* `-pf` - path of JSONL file including interactive prompts -* `-n` - number of benchmarking iterations, if the value greater 0, will exclude the first iteration. (default=0) -* `-ic` - limit the output token size (default 512) of text_gen and code_gen models. - +**Parameters:** +- `-m`: Path to the model. +- `-d`: Inference device (default: CPU). +- `-r`: Path to the CSV report. +- `-f`: Framework (default: ov). +- `-p`: Interactive prompt text. +- `-pf`: Path to a JSONL file containing prompts. +- `-n`: Number of iterations (default: 0, the first iteration is excluded). +- `-ic`: Limit the output token size (default: 512) for text generation and code generation models. + +**Additional options:** ``` bash python ./benchmark.py -h # for more information ``` -## Running `torch.compile()` +#### Benchmarking the Original PyTorch Model: +To benchmark the original PyTorch model, first download the model locally and then run benchmark by specifying PyTorch as the framework with parameter `-f pt` -The option `--torch_compile_backend` uses `torch.compile()` to speed up -the PyTorch code by compiling it into optimized kernels using a selected backend. +```bash +# Download PyTorch Model +huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir models/llama-2-7b-chat/pytorch +# Benchmark with PyTorch Framework +python benchmark.py -m models/llama-2-7b-chat/pytorch -n 2 -f pt +``` -Prerequisites: install benchmarking dependencies using requirements.txt +> **Note:** If needed, You can install a specific OpenVINO version using pip: +> ``` bash +> # e.g. +> pip install openvino==2024.4.0 +> # Optional, install the openvino nightly package if needed. +> # OpenVINO nightly is pre-release software and has not undergone full release validation or qualification. +> pip uninstall openvino +> pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly +> ``` -``` bash -pip install -r requirements.txt -``` +## 4. Benchmark LLM with `torch.compile()` + +The `--torch_compile_backend` option enables you to use `torch.compile()` to accelerate PyTorch models by compiling them into optimized kernels using a specified backend. -In order to run the `torch.compile()` on CUDA GPU, install additionally the nightly PyTorch version: +Before benchmarking, you need to download the original PyTorch model. 
Use the following command to download the model locally: ```bash -pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 +huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir models/llama-2-7b-chat/pytorch ``` -Add the option `--torch_compile_backend` with the desired backend: `pytorch` or `openvino` (default) while running the benchmarking script: +To run the benchmarking script with `torch.compile()`, use the `--torch_compile_backend` option to specify the backend. You can choose between `pytorch` or `openvino` (default). Example: ```bash python ./benchmark.py -m models/llama-2-7b-chat/pytorch -d CPU --torch_compile_backend openvino ``` -## Run on 2 sockets platform +> **Note:** To use `torch.compile()` with CUDA GPUs, you need to install the nightly version of PyTorch: +> +> ```bash +> pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 +> ``` + -benchmark.py sets openvino.properties.streams.num(1) by default +## 5. Running on 2-Socket Platforms -| OpenVINO version | Behaviors | +The benchmarking script sets `openvino.properties.streams.num(1)` by default. For multi-socket platforms, use `numactl` on Linux or the `--load_config` option to modify behavior. + +| OpenVINO Version | Behaviors | |:--------------------|:------------------------------------------------| -| Before 2024.0.0 | streams.num(1)
execute on 2 sockets. | -| 2024.0.0 | streams.num(1)
execute on the same socket as the APP is running on. | +| Before 2024.0.0 | streams.num(1)
execute on 2 sockets. | +| 2024.0.0 | streams.num(1)
execute on the same socket as the APP is running on. | -numactl on Linux or --load_config for benchmark.py can be used to change the behaviors. +For example, `--load_config config.json` as following will result in streams.num(1) and execute on 2 sockets. +```json +{ + "INFERENCE_NUM_THREADS": +} +``` +`` is the number of total physical cores in 2 sockets. -For example, --load_config config.json as following in OpenVINO 2024.0.0 will result in streams.num(1) and execute on 2 sockets. -``` -{"INFERENCE_NUM_THREADS":} -``` -`` is the number of total physical cores in 2 sockets +## 6. Additional Resources -## Additional Resources -### 1. NOTE -> If you encounter any errors, please check **[NOTES.md](./doc/NOTES.md)** which provides solutions to the known errors. -### 2. Image generation -> To configure more parameters for image generation models, reference to **[IMAGE_GEN.md](./doc/IMAGE_GEN.md)** +- **Error Troubleshooting:** Check the [NOTES.md](./doc/NOTES.md) for solutions to known issues. +- **Image Generation Configuration:** Refer to [IMAGE_GEN.md](./doc/IMAGE_GEN.md) for setting parameters for image generation models. \ No newline at end of file diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 7fb6d1757b..9dcfe74f66 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -308,13 +308,14 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " f"is different from md5 of the {num - 1} iteration {prev_md5}") llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) - if num == 1: - # if the device is CPU, throw exception - if args['devices'].lower().startswith('cpu') is True: + if not args.get("use_cb", False): + if num == 1: + # if the device is CPU, throw exception + if args['devices'].lower().startswith('cpu') is True: + assert (result_md5_list == prev_md5) + else: + # throw exception assert (result_md5_list == prev_md5) - else: - # throw exception - assert (result_md5_list == prev_md5) else: llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) @@ -814,7 +815,7 @@ def get_argprser(): llm_bench_utils.model_utils.add_stateful_model_arguments(parser) parser.add_argument("--genai", action="store_true", help="Use OpenVINO GenAI optimized pipelines for benchmarking") parser.add_argument("--use_cb", action="store_true", help="Use Continuous Batching inference mode") - parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings") + parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings or dict") parser.add_argument( '--end_token_stopping', action='store_true', diff --git a/llm_bench/python/convert.py b/llm_bench/python/convert.py index ae676bc269..49cea02c11 100644 --- a/llm_bench/python/convert.py +++ b/llm_bench/python/convert.py @@ -1464,6 +1464,8 @@ def main(): add_stateful_model_arguments(parser) args = parser.parse_args() + log.warning("[DEPRECATED] Not for production use! Please use the 'optimum-intel' to generate the IRs. 
For details, please check:" + " https://github.com/openvinotoolkit/openvino.genai/blob/master/llm_bench/python/README.md#2-convert-model-to-openvino-ir-format") log.info(f"openvino runtime version: {get_version()}") model_type = get_convert_model_type(args.model_id.lower()) converter = converters[model_type] diff --git a/llm_bench/python/llm_bench_utils/model_utils.py b/llm_bench/python/llm_bench_utils/model_utils.py index b35d7be47b..3d5359e26c 100644 --- a/llm_bench/python/llm_bench_utils/model_utils.py +++ b/llm_bench/python/llm_bench_utils/model_utils.py @@ -204,11 +204,17 @@ def get_use_case(model_name_or_path): def get_config(config): - with open(config, 'r') as f: + if Path(config).is_file(): + with open(config, 'r') as f: + try: + ov_config = json.load(f) + except Exception: + raise RuntimeError(f'==Parse file:{config} failiure, json format is incorrect ==') + else: try: - ov_config = json.load(f) + ov_config = json.loads(config) except Exception: - raise RuntimeError(f'==Parse file:{config} failiure, json format is incorrect ==') + raise RuntimeError(f'==Parse config:{config} failiure, json format is incorrect ==') return ov_config diff --git a/llm_bench/python/llm_bench_utils/ov_utils.py b/llm_bench/python/llm_bench_utils/ov_utils.py index b9434c5f3d..da77f5da22 100644 --- a/llm_bench/python/llm_bench_utils/ov_utils.py +++ b/llm_bench/python/llm_bench_utils/ov_utils.py @@ -189,11 +189,14 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): cb = kwargs.get("use_cb", False) if cb: log.info("Continuous Batching mode activated") + default_cb_config = {"cache_size": 1} + if "GPU" in device: + default_cb_config["block_size"] = 16 scheduler_config = openvino_genai.SchedulerConfig() - scheduler_params = kwargs.get("cb_config") or {"cache_size": 1} + scheduler_params = kwargs.get("cb_config") or default_cb_config if scheduler_params: log.info(f"Scheduler parameters:\n{scheduler_params}") - + for param, value in scheduler_params.items(): setattr(scheduler_config, param, value) ov_config["scheduler_config"] = scheduler_config @@ -209,19 +212,24 @@ def __init__(self, tokenizer): self.token_generation_time = [] self.generated_tokens = [] self.start_time = time.perf_counter() + def put(self, token_id): self.token_generation_time.append(time.perf_counter() - self.start_time) self.generated_tokens.append(token_id) self.start_time = time.perf_counter() return False + def reset(self): self.token_generation_time = [] self.generated_tokens = [] self.start_time = time.perf_counter() + def end(self): pass + def get_tokens(self): return self.generated_tokens + def get_time_list(self): return self.token_generation_time streamer = TokenStreamer(llm_pipe.get_tokenizer()) if cb else None diff --git a/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py b/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py index 12fc726f38..5781ddf229 100644 --- a/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py +++ b/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py @@ -6,7 +6,13 @@ from whowhatbench.wwb import load_dataset from optimum.intel.openvino import OVModelForCausalLM -from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationConfig, CacheEvictionConfig, AggregationMode +from openvino_genai import ( + ContinuousBatchingPipeline, + SchedulerConfig, + GenerationConfig, + CacheEvictionConfig, + AggregationMode, +) from openvino_tokenizers import convert_tokenizer from openvino import serialize 
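The `get_config` change in `llm_bench_utils/model_utils.py` above lets options such as `--load_config` and `--cb_config` accept either a path to a JSON file or an inline JSON string. Below is a minimal, self-contained sketch of that file-or-string fallback; it is illustrative only (the real helper lives in `llm_bench_utils.model_utils`, and the example file name is hypothetical):

```python
import json
from pathlib import Path


def get_config(config: str) -> dict:
    # Treat the argument as a file path first; fall back to parsing it
    # as an inline JSON string if no such file exists.
    if Path(config).is_file():
        with open(config, "r") as f:
            try:
                return json.load(f)
            except Exception:
                raise RuntimeError(f"==Parse file:{config} failure, json format is incorrect ==")
    try:
        return json.loads(config)
    except Exception:
        raise RuntimeError(f"==Parse config:{config} failure, json format is incorrect ==")


# Inline form, e.g. continuous-batching scheduler settings whose keys are
# applied to openvino_genai.SchedulerConfig via setattr() in ov_utils.py:
print(get_config('{"cache_size": 1, "block_size": 16}'))
# File form (hypothetical path): get_config("scheduler_config.json")
```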
@@ -18,12 +24,16 @@ MAX_SEQUENCES = 100 -model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) +model = OVModelForCausalLM.from_pretrained( + model_id, export=True, trust_remote_code=True +) tokenizer = AutoTokenizer.from_pretrained(model_id) model_path = PosixPath(tempfile.gettempdir()) / model_id model.save_pretrained(model_path) -ov_tokenizer, ov_detokenizer = convert_tokenizer(tokenizer, with_detokenizer=True, skip_special_tokens=True) +ov_tokenizer, ov_detokenizer = convert_tokenizer( + tokenizer, with_detokenizer=True, skip_special_tokens=True +) serialize(ov_tokenizer, model_path / "openvino_tokenizer.xml") serialize(ov_detokenizer, model_path / "openvino_detokenizer.xml") @@ -48,24 +58,39 @@ generation_config.num_return_sequences = 1 generation_config.max_new_tokens = MAX_NEW_TOKENS -data = load_dataset(path='squad', name=None, split='validation')["context"] -data_dict = {"questions": list(dict({k: None for k in data}).keys())[:MAX_SEQUENCES]} +data = load_dataset(path="squad", name=None, split="validation")["context"] +data_dict = {"prompts": list(dict({k: None for k in data}).keys())[:MAX_SEQUENCES]} -model_cb_noopt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_noopt, "CPU", {}) -model_cb_opt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {}) +model_cb_noopt = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config_noopt, "CPU", {} +) +model_cb_opt = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {} +) -GT_DATA_FILE = 'gt_data.csv' +GT_DATA_FILE = "gt_data.csv" if os.path.exists(GT_DATA_FILE): - evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, gt_data=GT_DATA_FILE, tokenizer=tokenizer, - test_data=data_dict, generation_config=generation_config, - max_new_tokens=MAX_NEW_TOKENS, seqs_per_request=3) + evaluator = whowhatbench.TextEvaluator( + base_model=model_cb_noopt, + gt_data=GT_DATA_FILE, + tokenizer=tokenizer, + test_data=data_dict, + generation_config=generation_config, + max_new_tokens=MAX_NEW_TOKENS, + seqs_per_request=3, + ) else: - evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict, - generation_config=generation_config, max_new_tokens=MAX_NEW_TOKENS, - seqs_per_request=3) - evaluator.dump_gt('gt_data.csv') + evaluator = whowhatbench.TextEvaluator( + base_model=model_cb_noopt, + tokenizer=tokenizer, + test_data=data_dict, + generation_config=generation_config, + max_new_tokens=MAX_NEW_TOKENS, + seqs_per_request=3, + ) + evaluator.dump_gt("gt_data.csv") all_metrics_per_question, all_metrics = evaluator.score(model_cb_opt) @@ -89,8 +114,18 @@ pipeline_opt_metrics = model_cb_opt.get_metrics() pipeline_noopt_metrics = model_cb_noopt.get_metrics() -print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}") -print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}") -max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage) -avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage) -print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x") +print( + f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}" 
+) +print( + f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}" +) +max_optimization_ratio = ( + pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage +) +avg_optimization_ratio = ( + pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage +) +print( + f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x" +) diff --git a/llm_bench/python/who_what_benchmark/tests/test_cli_image.py b/llm_bench/python/who_what_benchmark/tests/test_cli_image.py new file mode 100644 index 0000000000..f4c10eac86 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/tests/test_cli_image.py @@ -0,0 +1,98 @@ +import subprocess # nosec B404 +import os +import shutil +import pytest +import logging + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def run_wwb(args): + logger.info(" ".join(["wwb"] + args)) + result = subprocess.run(["wwb"] + args, capture_output=True, text=True) + logger.info(result) + return result + + +@pytest.mark.parametrize( + ("model_id", "model_type", "backend"), + [ + ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "hf"), + ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "openvino"), + ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "sd-xl", "hf"), + ], +) +def test_image_model_types(model_id, model_type, backend): + GT_FILE = "test_sd.json" + wwb_args = [ + "--base-model", + model_id, + "--target-model", + model_id, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + ] + if backend == "hf": + wwb_args.append("--hf") + + result = run_wwb(wwb_args) + print(f"WWB result: {result}, {result.stderr}") + + try: + os.remove(GT_FILE) + except OSError: + pass + shutil.rmtree("reference", ignore_errors=True) + shutil.rmtree("target", ignore_errors=True) + + assert result.returncode == 0 + assert "Metrics for model" in result.stderr + assert "## Reference text" not in result.stderr + + +@pytest.mark.parametrize( + ("model_id", "model_type", "backend"), + [ + ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "hf"), + ], +) +def test_image_custom_dataset(model_id, model_type, backend): + GT_FILE = "test_sd.json" + wwb_args = [ + "--base-model", + model_id, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + "--dataset", + "google-research-datasets/conceptual_captions", + "--dataset-field", + "caption", + ] + if backend == "hf": + wwb_args.append("--hf") + + result = run_wwb(wwb_args) + + assert os.path.exists(GT_FILE) + + try: + os.remove(GT_FILE) + except OSError: + pass + shutil.rmtree("reference", ignore_errors=True) + + assert result.returncode == 0 diff --git a/llm_bench/python/who_what_benchmark/tests/test_cli.py b/llm_bench/python/who_what_benchmark/tests/test_cli_text.py similarity index 51% rename from llm_bench/python/who_what_benchmark/tests/test_cli.py rename to llm_bench/python/who_what_benchmark/tests/test_cli_text.py index 8110e98335..161a9afb72 100644 --- a/llm_bench/python/who_what_benchmark/tests/test_cli.py +++ b/llm_bench/python/who_what_benchmark/tests/test_cli_text.py @@ -16,11 +16,7 @@ def run_wwb(args): logger.info(" ".join(["wwb"] + args)) - result = subprocess.run( - ["wwb"] + args, - capture_output=True, - text=True - ) + result = subprocess.run(["wwb"] + args, capture_output=True, text=True) logger.info(result) return result @@ -54,13 +50,21 @@ def 
teardown_module(): shutil.rmtree(tmp_dir) -def test_target_model(): - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU" - ]) +def test_text_target_model(): + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--model-type", + "text", + ] + ) assert result.returncode == 0 assert "Metrics for model" in result.stderr @@ -68,19 +72,28 @@ def test_target_model(): @pytest.fixture -def test_gt_data(): +def test_text_gt_data(): with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: temp_file_name = tmpfile.name - result = run_wwb([ - "--base-model", base_model_path, - "--gt-data", temp_file_name, - "--dataset", "EleutherAI/lambada_openai,en", - "--dataset-field", "text", - "--split", "test", - "--num-samples", "2", - "--device", "CPU" - ]) + result = run_wwb( + [ + "--base-model", + base_model_path, + "--gt-data", + temp_file_name, + "--dataset", + "EleutherAI/lambada_openai,en", + "--dataset-field", + "text", + "--split", + "test", + "--num-samples", + "2", + "--device", + "CPU", + ] + ) data = pd.read_csv(temp_file_name) os.remove(temp_file_name) @@ -88,76 +101,107 @@ def test_gt_data(): assert len(data["questions"].values) == 2 -def test_output_directory(): +def test_text_output_directory(): with tempfile.TemporaryDirectory() as temp_dir: - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU", - "--output", temp_dir - ]) + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--output", + temp_dir, + ] + ) assert result.returncode == 0 assert "Metrics for model" in result.stderr assert os.path.exists(os.path.join(temp_dir, "metrics_per_qustion.csv")) assert os.path.exists(os.path.join(temp_dir, "metrics.csv")) -def test_verbose(): - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU", - "--verbose" - ]) +def test_text_verbose(): + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--verbose", + ] + ) assert result.returncode == 0 assert "## Diff " in result.stderr -def test_language_autodetect(): +def test_text_language_autodetect(): with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: temp_file_name = tmpfile.name - result = run_wwb([ - "--base-model", "Qwen/Qwen2-0.5B", - "--gt-data", temp_file_name, - "--num-samples", "2", - "--device", "CPU" - ]) + result = run_wwb( + [ + "--base-model", + "Qwen/Qwen2-0.5B", + "--gt-data", + temp_file_name, + "--num-samples", + "2", + "--device", + "CPU", + ] + ) data = pd.read_csv(temp_file_name) os.remove(temp_file_name) assert result.returncode == 0 - assert "马克" in data["questions"].values[0] + assert "马克" in data["prompts"].values[0] -def test_hf_model(): +def test_text_hf_model(): with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: temp_file_name = tmpfile.name - result = run_wwb([ - "--base-model", model_id, - "--gt-data", temp_file_name, - "--num-samples", "2", - "--device", "CPU", - "--hf" - ]) + result = run_wwb( + [ + "--base-model", + model_id, + "--gt-data", + temp_file_name, + "--num-samples", + "2", + "--device", + "CPU", + "--hf", + ] + ) data = pd.read_csv(temp_file_name) 
os.remove(temp_file_name) assert result.returncode == 0 - assert len(data["questions"].values) == 2 - - -def test_genai_model(): - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU", - "--genai" - ]) + assert len(data["prompts"].values) == 2 + + +def test_text_genai_model(): + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--genai", + ] + ) assert result.returncode == 0 assert "Metrics for model" in result.stderr assert "## Reference text" not in result.stderr diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py b/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py index 86f428ddd7..4d61b0d086 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py @@ -1,4 +1,13 @@ -"""Who what benchmark APIs.""" -from .evaluator import Evaluator +from .registry import register_evaluator, MODELTYPE2TASK, EVALUATOR_REGISTRY +from .text_evaluator import TextEvaluator +from .text_evaluator import TextEvaluator as Evaluator +from .text2image_evaluator import Text2ImageEvaluator -__all__ = ["Evaluator"] +__all__ = [ + "Evaluator", + "register_evaluator", + "TextEvaluator", + "Text2ImageEvaluator", + "MODELTYPE2TASK", + "EVALUATOR_REGISTRY", +] diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/registry.py b/llm_bench/python/who_what_benchmark/whowhatbench/registry.py new file mode 100644 index 0000000000..208ba60ff3 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/whowhatbench/registry.py @@ -0,0 +1,50 @@ +from abc import ABC, abstractmethod + +from optimum.intel import ( + OVLatentConsistencyModelPipeline, + OVStableDiffusionPipeline, + OVStableDiffusionXLPipeline, +) + + +# Registry for evaluators +EVALUATOR_REGISTRY = {} +MODELTYPE2TASK = { + "text": "text-generation", + "sd": "image-generation", + "sd-xl": "image-generation", + "sd-lcm": "image-generation", +} + +TEXT2IMAGE_TASK2CLASS = { + "sd": OVStableDiffusionPipeline, + "sd-xl": OVStableDiffusionXLPipeline, + "sd-lcm": OVLatentConsistencyModelPipeline, +} + + +def register_evaluator(*names): + def decorate(cls): + for name in names: + assert ( + name not in EVALUATOR_REGISTRY + ), f"Evaluator named '{name}' conflicts with existing evaluators! Please register with a non-conflicting alias instead." 
+ + EVALUATOR_REGISTRY[name] = cls + return cls + + return decorate + + +class BaseEvaluator(ABC): + @abstractmethod + def dump_gt(self, csv_name: str): + pass + + @abstractmethod + def score(self, model, **kwargs): + pass + + @abstractmethod + def worst_examples(self, top_k: int = 5, metric="similarity"): + pass diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py b/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py new file mode 100644 index 0000000000..b8b8234547 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py @@ -0,0 +1,157 @@ +import os +from typing import Any, Union + +import pandas as pd +from tqdm import tqdm +from transformers import set_seed +import torch + +from .registry import register_evaluator, BaseEvaluator + +from .whowhat_metrics import ImageSimilarity + +default_data = { + "prompts": [ + "Cinematic, a vibrant Mid-century modern dining area, colorful chairs and a sideboard, ultra realistic, many detail", + "colibri flying near a flower, side view, forest background, natural light, photorealistic, 4k", + "Illustration of an astronaut sitting in outer space, moon behind him", + "A vintage illustration of a retro computer, vaporwave aesthetic, light pink and light blue", + "A view from beautiful alien planet, very beautiful, surealism, retro astronaut on the first plane, 8k photo", + ], +} + + +@register_evaluator("image-generation") +class Text2ImageEvaluator(BaseEvaluator): + def __init__( + self, + base_model: Any = None, + gt_data: str = None, + test_data: Union[str, list] = None, + metrics="similarity", + similarity_model_id: str = "openai/clip-vit-large-patch14", + resolution=(512, 512), + num_inference_steps=4, + crop_prompts=True, + num_samples=None, + gen_image_fn=None, + seed=42, + ) -> None: + assert ( + base_model is not None or gt_data is not None + ), "Text generation pipeline for evaluation or ground trush data must be defined" + + self.test_data = test_data + self.metrics = metrics + self.resolution = resolution + self.crop_prompt = crop_prompts + self.num_samples = num_samples + self.num_inference_steps = num_inference_steps + self.seed = seed + self.similarity = None + self.similarity = ImageSimilarity(similarity_model_id) + self.last_cmp = None + self.gt_dir = os.path.dirname(gt_data) + if base_model: + self.gt_data = self._generate_data( + base_model, gen_image_fn, os.path.join(self.gt_dir, "reference") + ) + else: + self.gt_data = pd.read_csv(gt_data, keep_default_na=False) + + def dump_gt(self, csv_name: str): + self.gt_data.to_csv(csv_name) + + def score(self, model, gen_image_fn=None): + predictions = self._generate_data( + model, gen_image_fn, os.path.join(self.gt_dir, "target") + ) + + all_metrics_per_prompt = {} + all_metrics = {} + + if self.similarity: + metric_dict, metric_per_question = self.similarity.evaluate( + self.gt_data, predictions + ) + all_metrics.update(metric_dict) + all_metrics_per_prompt.update(metric_per_question) + + self.last_cmp = all_metrics_per_prompt + self.last_cmp["prompts"] = predictions["prompts"].values + self.last_cmp["source_model"] = self.gt_data["images"].values + self.last_cmp["optimized_model"] = predictions["images"].values + self.last_cmp = pd.DataFrame(self.last_cmp) + + return pd.DataFrame(all_metrics_per_prompt), pd.DataFrame([all_metrics]) + + def worst_examples(self, top_k: int = 5, metric="similarity"): + assert self.last_cmp is not None + + res = self.last_cmp.nsmallest(top_k, metric) + res = list(row for 
idx, row in res.iterrows()) + + return res + + def _generate_data(self, model, gen_image_fn=None, image_dir="reference"): + if hasattr(model, "reshape") and self.resolution is not None: + model.reshape( + batch_size=1, + height=self.resolution[0], + width=self.resolution[1], + num_images_per_prompt=1, + ) + + def default_gen_image_fn(model, prompt, num_inference_steps, generator=None): + output = model( + prompt, + num_inference_steps=num_inference_steps, + output_type="pil", + width=self.resolution[0], + height=self.resolution[0], + generator=generator, + ) + return output.images[0] + + gen_image_fn = gen_image_fn or default_gen_image_fn + + if self.test_data: + if isinstance(self.test_data, str): + data = pd.read_csv(self.test_data) + else: + if isinstance(self.test_data, dict): + assert "prompts" in self.test_data + data = dict(self.test_data) + else: + data = {"prompts": list(self.test_data)} + data = pd.DataFrame.from_dict(data) + else: + data = pd.DataFrame.from_dict(default_data) + + prompts = data["prompts"] + prompts = ( + prompts.values + if self.num_samples is None + else prompts.values[: self.num_samples] + ) + images = [] + rng = torch.Generator(device="cpu") + + if not os.path.exists(image_dir): + os.makedirs(image_dir) + for i, prompt in tqdm(enumerate(prompts), desc="Evaluate pipeline"): + set_seed(self.seed) + image = gen_image_fn( + model, + prompt, + self.num_inference_steps, + generator=rng.manual_seed(self.seed), + ) + image_path = os.path.join(image_dir, f"{i}.png") + image.save(image_path) + images.append(image_path) + + res_data = {"prompts": list(prompts), "images": images} + df = pd.DataFrame(res_data) + + return df diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py b/llm_bench/python/who_what_benchmark/whowhatbench/text_evaluator.py similarity index 72% rename from llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py rename to llm_bench/python/who_what_benchmark/whowhatbench/text_evaluator.py index bb0d17e34e..a6453cb66f 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/text_evaluator.py @@ -3,11 +3,12 @@ import pandas as pd from tqdm import tqdm -from .whowhat_metrics import DivergencyMetric, SimilarityMetric +from .registry import register_evaluator, BaseEvaluator +from .whowhat_metrics import TextDivergency, TextSimilarity default_data = { - "en" : { - "questions": [ + "en": { + "prompts": [ "Who is Mark Twain?", "Who is William Shakespeare?", "Who is Agatha Christie?", @@ -38,12 +39,12 @@ ], }, "cn": { - "questions": [ + "prompts": [ "马克吐温是谁?", "谁是威廉-莎士比亚?", "阿加莎-克里斯蒂是谁?", "芭芭拉-卡特兰是谁?", - "丹妮尔-斯蒂尔是谁?" 
+ "丹妮尔-斯蒂尔是谁?", "谁是哈罗德-罗宾斯?", "乔治-西默农是谁?", "伊妮德-布莱顿是谁?", @@ -86,7 +87,10 @@ def autodetect_language(model): return model2language.get(model.config.model_type, "en") -class Evaluator: +@register_evaluator( + "text-generation", "text-generation-with-past", "text2text-generation" +) +class TextEvaluator(BaseEvaluator): def __init__( self, base_model: Any = None, @@ -102,7 +106,7 @@ def __init__( gen_answer_fn=None, generation_config=None, generation_config_base=None, - seqs_per_request=None + seqs_per_request=None, ) -> None: assert ( base_model is not None or gt_data is not None @@ -127,7 +131,9 @@ def __init__( self.language = autodetect_language(base_model) if base_model: - self.gt_data = self._generate_data(base_model, gen_answer_fn, generation_config=generation_config) + self.gt_data = self._generate_data( + base_model, gen_answer_fn, generation_config=generation_config + ) else: self.gt_data = pd.read_csv(gt_data, keep_default_na=False) @@ -138,10 +144,10 @@ def __init__( self.similarity = None self.divergency = None if "similarity" in self.metrics: - self.similarity = SimilarityMetric(similarity_model_id) + self.similarity = TextSimilarity(similarity_model_id) if "divergency" in self.metrics: assert tokenizer is not None - self.divergency = DivergencyMetric(tokenizer) + self.divergency = TextDivergency(tokenizer) self.last_cmp = None @@ -151,7 +157,7 @@ def dump_gt(self, csv_name: str): def score(self, model, gen_answer_fn=None): predictions = self._generate_data(model, gen_answer_fn, self.generation_config) - all_metrics_per_question = {} + all_metrics_per_prompt = {} all_metrics = {} if self.similarity: @@ -159,23 +165,23 @@ def score(self, model, gen_answer_fn=None): self.gt_data, predictions ) all_metrics.update(metric_dict) - all_metrics_per_question.update(metric_per_question) + all_metrics_per_prompt.update(metric_per_question) if self.divergency: metric_dict, metric_per_question = self.divergency.evaluate( self.gt_data, predictions ) all_metrics.update(metric_dict) - all_metrics_per_question.update(metric_per_question) + all_metrics_per_prompt.update(metric_per_question) - self.last_cmp = all_metrics_per_question - self.last_cmp["questions"] = predictions["questions"].values + self.last_cmp = all_metrics_per_prompt + self.last_cmp["prompts"] = predictions["prompts"].values self.last_cmp["source_model"] = self.gt_data["answers"].values self.last_cmp["optimized_model"] = predictions["answers"].values self.last_cmp = pd.DataFrame(self.last_cmp) - self.last_cmp.rename(columns={"questions": "prompt"}, inplace=True) + self.last_cmp.rename(columns={"prompts": "prompt"}, inplace=True) - return pd.DataFrame(all_metrics_per_question), pd.DataFrame([all_metrics]) + return pd.DataFrame(all_metrics_per_prompt), pd.DataFrame([all_metrics]) def worst_examples(self, top_k: int = 5, metric="similarity"): assert self.last_cmp is not None @@ -190,12 +196,12 @@ def worst_examples(self, top_k: int = 5, metric="similarity"): return res def _generate_data(self, model, gen_answer_fn=None, generation_config=None): - def default_gen_answer(model, tokenizer, question, max_new_tokens, crop_question): - inputs = self.tokenizer(question, return_tensors="pt") + def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question): + inputs = self.tokenizer(prompt, return_tensors="pt") tokens = model.generate(**inputs, max_new_tokens=max_new_tokens) out = self.tokenizer.batch_decode(tokens, skip_special_tokens=True)[0] - return out[len(question) :] if crop_question else out + return out[len(prompt) 
:] if crop_question else out gen_answer_fn = gen_answer_fn or default_gen_answer @@ -204,39 +210,58 @@ def default_gen_answer(model, tokenizer, question, max_new_tokens, crop_question data = pd.read_csv(self.test_data) else: if isinstance(self.test_data, dict): - assert "questions" in self.test_data + assert "prompts" in self.test_data data = dict(self.test_data) else: - data = {"questions": list(self.test_data)} + data = {"prompts": list(self.test_data)} data = pd.DataFrame.from_dict(data) else: if self.language is None: - print("No language detecting in the base model or ground truth data. Taking language from target model.") + print( + "No language detecting in the base model or ground truth data. Taking language from target model." + ) self.language = autodetect_language(model) data = pd.DataFrame.from_dict(default_data[self.language]) - questions = data["questions"] + prompt_data = data["prompts"] answers = [] - prompts = questions.values if self.num_samples is None else questions.values[:self.num_samples] + prompts = ( + prompt_data.values + if self.num_samples is None + else prompt_data.values[: self.num_samples] + ) if generation_config is None: - for q in tqdm(prompts, desc="Evaluate pipeline"): - answers.append(gen_answer_fn(model, self.tokenizer, q, self.max_new_tokens, self._crop_question)) + for p in tqdm(prompts, desc="Evaluate pipeline"): + answers.append( + gen_answer_fn( + model, + self.tokenizer, + p, + self.max_new_tokens, + self._crop_question, + ) + ) else: - with tqdm(total=len(questions.values)) as progress_bar: + with tqdm(total=len(prompt_data.values)) as progress_bar: batch = [] - for q_idx, q in enumerate(questions.values): + for p_idx, p in enumerate(prompt_data.values): progress_bar.update(1) - batch.append(q) - if len(batch) == self.seqs_per_request or q_idx == len(questions.values) - 1: - ans_batch = model.generate(batch, [generation_config] * len(batch)) + batch.append(p) + if ( + len(batch) == self.seqs_per_request + or p_idx == len(prompt_data.values) - 1 + ): + ans_batch = model.generate( + batch, [generation_config] * len(batch) + ) for ans in ans_batch: answers.append(ans.m_generation_ids[0]) batch.clear() - res_data = {"questions": list(prompts), "answers": answers} + res_data = {"prompts": list(prompts), "answers": answers} df = pd.DataFrame(res_data) df["language"] = self.language diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py b/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py index 83157e05ca..bbf96a3312 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py @@ -1,10 +1,15 @@ """ Metrics for text similarity """ + from difflib import SequenceMatcher +from PIL import Image +import torch +import torch.nn.functional as F import numpy as np from sentence_transformers import SentenceTransformer, util +from transformers import CLIPImageProcessor, CLIPModel from tqdm import tqdm @@ -68,9 +73,7 @@ def evaluate_divergency(tokenizer, data_gold, data_prediction): fdt_list.append(fdt) num_matched = sum(block.size for block in blocks) - sdt = ( - len(b_indexes) - num_matched - ) + sdt = len(b_indexes) - num_matched sdt_list.append(sdt) sdt_norm = sdt / len(b_indexes) sdtn_list.append(sdt_norm) @@ -104,7 +107,7 @@ def evaluate_divergency(tokenizer, data_gold, data_prediction): return metric_dict, metric_per_question -class SimilarityMetric: +class TextSimilarity: def __init__(self, model_id) -> None: self.model = 
SentenceTransformer(model_id) @@ -112,9 +115,47 @@ def evaluate(self, gt, prediction): return evaluate_similarity(self.model, gt, prediction) -class DivergencyMetric: +class TextDivergency: def __init__(self, tokenizer) -> None: self.tokenizer = tokenizer def evaluate(self, gt, prediction): return evaluate_divergency(self.tokenizer, gt, prediction) + + +# Image metrics +def evaluate_image_similarity(processor, model, data_gold, data_prediction): + images_gold = data_gold["images"].values + images_prediction = data_prediction["images"].values + + metric_per_image = [] + for gold, prediction in tqdm( + zip(images_gold, images_prediction), desc="Image Similarity evaluation" + ): + gold_image = Image.open(gold) + prediction_image = Image.open(prediction) + + gold_inputs = processor(images=gold_image, return_tensors="pt")["pixel_values"] + prediction_inputs = processor(images=prediction_image, return_tensors="pt")[ + "pixel_values" + ] + + with torch.no_grad(): + gold_outputs = model.get_image_features(gold_inputs) + prediction_outputs = model.get_image_features(prediction_inputs) + + cos_sim = F.cosine_similarity(gold_outputs, prediction_outputs) + print("cos_sim: ", cos_sim.item()) + metric_per_image.append(cos_sim.item()) + + metric_dict = {"similarity": np.mean(metric_per_image)} + return metric_dict, {"similarity": metric_per_image} + + +class ImageSimilarity: + def __init__(self, model_id) -> None: + self.processor = CLIPImageProcessor.from_pretrained(model_id) + self.model = CLIPModel.from_pretrained(model_id).eval() + + def evaluate(self, gt, prediction): + return evaluate_image_similarity(self.processor, self.model, gt, prediction) diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py b/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py index 8efca22059..3798bb044c 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py @@ -5,28 +5,40 @@ import pandas as pd import logging from datasets import load_dataset -from optimum.exporters import TasksManager +from diffusers import DiffusionPipeline from optimum.intel.openvino import OVModelForCausalLM from optimum.utils import NormalizedConfigManager, NormalizedTextConfig from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM -from . 
import Evaluator +from optimum.exporters.tasks import TasksManager +from optimum.intel import ( + OVLatentConsistencyModelPipeline, + OVStableDiffusionPipeline, + OVStableDiffusionXLPipeline, +) + +import openvino_genai +from whowhatbench import EVALUATOR_REGISTRY, MODELTYPE2TASK + # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -TasksManager._SUPPORTED_MODEL_TYPE["stablelm-epoch"] = TasksManager._SUPPORTED_MODEL_TYPE["llama"] +TasksManager._SUPPORTED_MODEL_TYPE["stablelm-epoch"] = ( + TasksManager._SUPPORTED_MODEL_TYPE["llama"] +) NormalizedConfigManager._conf["stablelm-epoch"] = NormalizedTextConfig.with_args( num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", ) -class GenAIModelWrapper(): +class GenAIModelWrapper: """ A helper class to store additional attributes for GenAI models """ + def __init__(self, model, model_dir): self.model = model self.config = AutoConfig.from_pretrained(model_dir) @@ -38,7 +50,7 @@ def __getattr__(self, attr): return getattr(self.model, attr) -def load_genai_pipeline(model_dir, device="CPU"): +def load_text_genai_pipeline(model_dir, device="CPU"): try: import openvino_genai except ImportError: @@ -48,13 +60,17 @@ def load_genai_pipeline(model_dir, device="CPU"): return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device), model_dir) -def load_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False): +def load_text_model( + model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): if use_hf: logger.info("Using HF Transformers API") - return AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, device_map=device.lower()) + return AutoModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device_map=device.lower() + ) if use_genai: - return load_genai_pipeline(model_id, device) + return load_text_genai_pipeline(model_id, device) if ov_config: with open(ov_config) as f: @@ -62,7 +78,9 @@ def load_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=F else: ov_options = None try: - model = OVModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, device=device, ov_config=ov_options) + model = OVModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device=device, ov_config=ov_options + ) except ValueError: config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) model = OVModelForCausalLM.from_pretrained( @@ -71,11 +89,67 @@ def load_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=F trust_remote_code=True, use_cache=True, device=device, - ov_config=ov_options + ov_config=ov_options, ) return model +TEXT2IMAGE_TASK2CLASS = { + "sd": OVStableDiffusionPipeline, + "sd-xl": OVStableDiffusionXLPipeline, + "sd-lcm": OVLatentConsistencyModelPipeline, +} + + +def load_text2image_model( + model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): + if ov_config: + with open(ov_config) as f: + ov_options = json.load(f) + else: + ov_options = None + + if use_hf: + return DiffusionPipeline.from_pretrained(model_id, trust_remote_code=True) + + TEXT2IMAGEPipeline = TEXT2IMAGE_TASK2CLASS[model_type] + + try: + model = TEXT2IMAGEPipeline.from_pretrained( + model_id, trust_remote_code=True, device=device, ov_config=ov_options + ) + except ValueError: + config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + model = TEXT2IMAGEPipeline.from_pretrained( + model_id, + config=config, + trust_remote_code=True, + 
use_cache=True, + device=device, + ov_config=ov_options, + ) + return model + + +def load_model( + model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): + from .registry import MODELTYPE2TASK + + if model_id is None: + return None + + if model_type == "text": + return load_text_model(model_id, device, ov_config, use_hf, use_genai) + elif MODELTYPE2TASK[model_type] == "image-generation": + return load_text2image_model( + model_type, model_id, device, ov_config, use_hf, use_genai + ) + else: + raise ValueError(f"Unsupported model type: {model_type}") + + def load_prompts(args): if args.dataset is None: return None @@ -93,7 +167,7 @@ def load_prompts(args): res = data[args.dataset_field] - res = {"questions": list(res)} + res = {"prompts": list(res)} return res @@ -127,7 +201,14 @@ def parse_args(): "I defined and not exists them will be generated by base_model evaluation.", ) parser.add_argument( - "--text-encoder", + "--model-type", + type=str, + choices=["text", "sd", "sd-xl", "sd-lcm"], + default="text", + help="Indicated the model type, e.g. 'text', 'sd'.", + ) + parser.add_argument( + "--data-encoder", type=str, default="sentence-transformers/all-mpnet-base-v2", help="Model for measurement of similarity between base_model and target_model." @@ -145,7 +226,7 @@ def parse_args(): parser.add_argument( "--dataset-field", type=str, - default="questions", + default="text", help="The name of field in dataset for prompts. For example question or context in squad." "Will be used only if dataset is defined.", ) @@ -258,44 +339,120 @@ def diff_strings(a: str, b: str, *, use_loguru_colors: bool = False) -> str: def genai_gen_answer(model, tokenizer, question, max_new_tokens, skip_question): - out = model.generate(question, max_new_tokens=max_new_tokens) + config = openvino_genai.GenerationConfig() + config.max_new_tokens = max_new_tokens + out = model.generate(question, config) return out +def get_evaluator(base_model, args): + # config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + # task = TasksManager.infer_task_from_model(config._name_or_path) + # TODO: Add logic to auto detect task based on model_id (TaskManager does not work for locally saved models) + task = MODELTYPE2TASK[args.model_type] + + try: + EvaluatorCLS = EVALUATOR_REGISTRY[task] + prompts = load_prompts(args) + + if task == "text-generation": + tokenizer = load_tokenizer(args) + return EvaluatorCLS( + base_model=base_model, + gt_data=args.gt_data, + test_data=prompts, + tokenizer=tokenizer, + similarity_model_id=args.data_encoder, + num_samples=args.num_samples, + language=args.language, + gen_answer_fn=genai_gen_answer if args.genai else None, + ) + elif task == "image-generation": + return EvaluatorCLS( + base_model=base_model, + gt_data=args.gt_data, + test_data=prompts, + num_samples=args.num_samples, + ) + else: + raise ValueError(f"Unsupported task: {task}") + + except KeyError: + raise ValueError( + f"Attempted to load evaluator for '{task}', but no evaluator for this model type found!" 
+            f" Supported model types: {', '.join(EVALUATOR_REGISTRY.keys())}"
+        )
+
+
+def print_text_results(evaluator):
+    metric_of_interest = "similarity"
+    worst_examples = evaluator.worst_examples(top_k=5, metric=metric_of_interest)
+    for i, e in enumerate(worst_examples):
+        ref_text = ""
+        actual_text = ""
+        diff = ""
+        for l1, l2 in zip(
+            e["source_model"].splitlines(), e["optimized_model"].splitlines()
+        ):
+            if l1 == "" and l2 == "":
+                continue
+            ref_text += l1 + "\n"
+            actual_text += l2 + "\n"
+            diff += diff_strings(l1, l2) + "\n"
+
+        logger.info(
+            "--------------------------------------------------------------------------------------"
+        )
+        logger.info("## Reference text %d:\n%s", i + 1, ref_text)
+        logger.info("## Actual text %d:\n%s", i + 1, actual_text)
+        logger.info("## Diff %d: ", i + 1)
+        logger.info(diff)
+
+
+def print_image_results(evaluator):
+    metric_of_interest = "similarity"
+    worst_examples = evaluator.worst_examples(top_k=1, metric=metric_of_interest)
+    for i, e in enumerate(worst_examples):
+        logger.info(
+            "--------------------------------------------------------------------------------------"
+        )
+        logger.info(f"Top-{i+1} example:")
+        logger.info(e)
+
+
 def main():
     args = parse_args()
     check_args(args)
-    prompts = load_prompts(args)
-    tokenizer = load_tokenizer(args)
 
     if args.gt_data and os.path.exists(args.gt_data):
-        evaluator = Evaluator(
-            base_model=None,
-            gt_data=args.gt_data,
-            test_data=prompts,
-            tokenizer=tokenizer,
-            similarity_model_id=args.text_encoder,
-            num_samples=args.num_samples,
-            language=args.language,
-        )
+        evaluator = get_evaluator(None, args)
     else:
-        base_model = load_model(args.base_model, args.device, args.ov_config, args.hf, args.genai)
-        evaluator = Evaluator(
-            base_model=base_model,
-            test_data=prompts,
-            tokenizer=tokenizer,
-            similarity_model_id=args.text_encoder,
-            num_samples=args.num_samples,
-            language=args.language,
-            gen_answer_fn=genai_gen_answer if args.genai else None
+        base_model = load_model(
+            args.model_type,
+            args.base_model,
+            args.device,
+            args.ov_config,
+            args.hf,
+            args.genai,
         )
+        evaluator = get_evaluator(base_model, args)
+
         if args.gt_data:
             evaluator.dump_gt(args.gt_data)
         del base_model
 
     if args.target_model:
-        target_model = load_model(args.target_model, args.device, args.ov_config, args.hf, args.genai)
-        all_metrics_per_question, all_metrics = evaluator.score(target_model, genai_gen_answer if args.genai else None)
+        target_model = load_model(
+            args.model_type,
+            args.target_model,
+            args.device,
+            args.ov_config,
+            args.hf,
+            args.genai,
+        )
+        all_metrics_per_question, all_metrics = evaluator.score(
+            target_model, genai_gen_answer if args.genai else None
+        )
         logger.info("Metrics for model: %s", args.target_model)
         logger.info(all_metrics)
 
@@ -307,25 +464,11 @@ def main():
         df = pd.DataFrame(all_metrics)
         df.to_csv(os.path.join(args.output, "metrics.csv"))
 
-    if args.verbose:
-        metric_of_interest = "similarity"
-        worst_examples = evaluator.worst_examples(top_k=5, metric=metric_of_interest)
-        for i, e in enumerate(worst_examples):
-            ref_text = ""
-            actual_text = ""
-            diff = ""
-            for l1, l2 in zip(e["source_model"].splitlines(), e["optimized_model"].splitlines()):
-                if l1 == "" and l2 == "":
-                    continue
-                ref_text += l1 + "\n"
-                actual_text += l2 + "\n"
-                diff += diff_strings(l1, l2) + "\n"
-
-            logger.info("--------------------------------------------------------------------------------------")
-            logger.info("## Reference text %d:\n%s", i + 1, ref_text)
-            logger.info("## Actual text %d:\n%s", i + 1, actual_text)
-
logger.info("## Diff %d: ", i + 1) - logger.info(diff) + if args.verbose and args.target_model is not None: + if args.model_type == "text": + print_text_results(evaluator) + elif "sd" in args.model_type: + print_image_results(evaluator) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index b7a23efa98..7be4478108 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ find_python3 = true build_args = ["--parallel", "--target", "py_generate_pipeline"] install_args = ["--strip"] install_components = ["wheel_genai"] +options = {"BUILD_TOKENIZERS" = "OFF"} [build-system] requires = [ diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 3ca30f775b..2a8f26ff4d 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -14,7 +14,7 @@ add_subdirectory(cpp/visual_language_chat) add_subdirectory(cpp/speculative_decoding_lm) add_subdirectory(cpp/benchmark_genai) add_subdirectory(cpp/whisper_speech_recognition) -add_subdirectory(cpp/stable_diffusion) +add_subdirectory(cpp/text2image) install(FILES requirements.txt DESTINATION samples COMPONENT cpp_samples_genai) @@ -25,10 +25,10 @@ install(DIRECTORY cpp/greedy_causal_lm cpp/multinomial_causal_lm # Don't install prompt_lookup_decoding_lm and speculative_decoding_lm because they don't use openvino_genai library and arent verifyed yet. - # Don't install continuous_batching_accuracy and continuous_batching_benchmark because they depend on json. + # Don't install continuous_batching_accuracy and continuous_batching_benchmark because CB isn't ready. cpp/visual_language_chat cpp/whisper_speech_recognition - cpp/stable_diffusion + cpp/text2image cpp/lora_greedy_causal_lm DESTINATION samples/cpp COMPONENT cpp_samples_genai) @@ -38,6 +38,6 @@ install(DIRECTORY python/greedy_causal_lm python/multinomial_causal_lm python/whisper_speech_recognition - # python/stable_diffusion + # python/text2image DESTINATION samples/python COMPONENT cpp_samples_genai USE_SOURCE_PERMISSIONS) diff --git a/samples/cpp/stable_diffusion/README.md b/samples/cpp/stable_diffusion/README.md deleted file mode 100644 index 5e6bfd0f9d..0000000000 --- a/samples/cpp/stable_diffusion/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# Stable Diffusion C++ Image Generation Pipeline - -This example showcases inference of text to image models like Stable Diffusion 1.5, 2.1, LCM. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::Text2ImagePipeline` and uses a text prompt as input source. - -Users can change the sample code and play with the following generation parameters: - -- Change width or height of generated image -- Generate multiple images per prompt -- Adjust a number of inference steps -- Play with [guidance scale](https://huggingface.co/spaces/stabilityai/stable-diffusion/discussions/9) (read [more details](https://arxiv.org/abs/2207.12598)) -- (SD 1.x, 2.x only) Add negative prompt when guidance scale > 1 - -## Download and convert the models and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. 
- -```sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16 -``` - -## Run - -`stable_diffusion ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'` - -### Examples - -Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` - - ![](./512x512.bmp) - -## Supported models - -Models can be downloaded from [OpenAI HiggingFace](https://huggingface.co/openai). This sample can run the following list of models, but not limitied to: - -- [botp/stable-diffusion-v1-5](https://huggingface.co/botp/stable-diffusion-v1-5) -- [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) -- [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) -- [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) -- [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) - -## Note - -- Image generated with HuggingFace / Optimum Intel is not the same generated by this C++ sample: - -C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor`. So, it's expected that image generated by Python and C++ versions provide different images, because latent images are initialize differently. Users can implement their own random generator derived from `ov::genai::Generator` and pass it to `Text2ImagePipeline::generate` method. diff --git a/samples/cpp/stable_diffusion/main.cpp b/samples/cpp/stable_diffusion/main.cpp deleted file mode 100644 index 05fc7a2535..0000000000 --- a/samples/cpp/stable_diffusion/main.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "openvino/genai/text2image/pipeline.hpp" - -#include "imwrite.hpp" - -int32_t main(int32_t argc, char* argv[]) try { - OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); - - const std::string models_path = argv[1], prompt = argv[2]; - const std::string device = "CPU"; // GPU, NPU can be used as well - - ov::genai::Text2ImagePipeline pipe(models_path, device); - ov::Tensor image = pipe.generate(prompt, - ov::genai::width(512), - ov::genai::height(512), - ov::genai::num_inference_steps(20)); - - imwrite("image.bmp", image, true); - - return EXIT_SUCCESS; -} catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; -} catch (...) 
{ - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; -} diff --git a/samples/cpp/stable_diffusion/512x512.bmp b/samples/cpp/text2image/512x512.bmp similarity index 100% rename from samples/cpp/stable_diffusion/512x512.bmp rename to samples/cpp/text2image/512x512.bmp diff --git a/samples/cpp/stable_diffusion/CMakeLists.txt b/samples/cpp/text2image/CMakeLists.txt similarity index 56% rename from samples/cpp/stable_diffusion/CMakeLists.txt rename to samples/cpp/text2image/CMakeLists.txt index a7a6f067b3..ca0f832f6d 100644 --- a/samples/cpp/stable_diffusion/CMakeLists.txt +++ b/samples/cpp/text2image/CMakeLists.txt @@ -8,7 +8,7 @@ find_package(OpenVINOGenAI REQUIRED NO_CMAKE_FIND_ROOT_PATH ) -# create executable +# create main sample executable add_executable(stable_diffusion ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp @@ -26,3 +26,22 @@ install(TARGETS stable_diffusion RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) + +# create LoRA sample executable + +add_executable(lora_stable_diffusion + ${CMAKE_CURRENT_SOURCE_DIR}/lora.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/imwrite.cpp) + +target_include_directories(lora_stable_diffusion PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(lora_stable_diffusion PRIVATE openvino::genai) + +set_target_properties(lora_stable_diffusion PROPERTIES + COMPILE_PDB_NAME lora_stable_diffusion + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS lora_stable_diffusion + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/text2image/README.md b/samples/cpp/text2image/README.md new file mode 100644 index 0000000000..f73da334f4 --- /dev/null +++ b/samples/cpp/text2image/README.md @@ -0,0 +1,78 @@ +# Text to Image C++ Generation Pipeline + +Examples in this folder showcase inference of text to image models like Stable Diffusion 1.5, 2.1, LCM. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::Text2ImagePipeline` and uses a text prompt as input source. + +There are two sample files: + - [`main.cpp`](./main.cpp) demonstrates basic usage of the text to image pipeline + - [`lora.cpp`](./lora.cpp) shows how to apply LoRA adapters to the pipeline + +Users can change the sample code and play with the following generation parameters: + +- Change width or height of generated image +- Generate multiple images per prompt +- Adjust a number of inference steps +- Play with [guidance scale](https://huggingface.co/spaces/stabilityai/stable-diffusion/discussions/9) (read [more details](https://arxiv.org/abs/2207.12598)) +- (SD 1.x, 2.x only) Add negative prompt when guidance scale > 1 +- Apply multiple different LoRA adapters and mix them with different blending coefficients + +## Download and convert the models and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. 
+
+```sh
+pip install --upgrade-strategy eager -r ../../requirements.txt
+optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16
+```
+
+## Run
+
+`stable_diffusion ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'`
+
+### Examples
+
+Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting`
+
+   ![](./512x512.bmp)
+
+## Supported models
+
+Models can be downloaded from [HuggingFace](https://huggingface.co/models). This sample can run the following list of models, but is not limited to:
+
+- [botp/stable-diffusion-v1-5](https://huggingface.co/botp/stable-diffusion-v1-5)
+- [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2)
+- [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1)
+- [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0)
+- [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7)
+- [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+- [stabilityai/stable-diffusion-xl-base-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9)
+
+## Run with optional LoRA adapters
+
+LoRA adapters can be connected to the pipeline to modify generated images toward a certain style, level of detail, or quality. Adapters are supported in Safetensors format and can be downloaded from public sources like [Civitai](https://civitai.com) or [HuggingFace](https://huggingface.co/models), or trained by the user. Only adapters compatible with the base model should be used. A weighted blend of multiple adapters can be applied by specifying multiple adapter files with corresponding alpha parameters on the command line. Check the `lora.cpp` source code to learn how to enable adapters and specify them in each `generate` call; a minimal blending sketch is also shown below.
+
+Here is an example of how to run the sample with a single adapter. First, download the adapter file from the https://civitai.com/models/67927/soulcard page manually and save it as `soulcard.safetensors`, or download it from the command line:
+
+`wget -O soulcard.safetensors https://civitai.com/api/download/models/72591`
+
+Then run the `lora_stable_diffusion` executable:
+
+`./lora_stable_diffusion dreamlike_anime_1_0_ov/FP16 'curly-haired unicorn in the forest, anime, line' soulcard.safetensors 0.7`
+
+The sample generates two images with the same prompt, with and without adapters applied:
+ - `lora.bmp` with adapters applied
+ - `baseline.bmp` without adapters applied
+
+Check the difference:
+
+With adapter | Without adapter
+:---:|:---:
+![](./lora.bmp) | ![](./baseline.bmp)
+
+
+## Note
+
+- An image generated with HuggingFace / Optimum Intel is not the same as the one generated by this C++ sample:
+
+C++ random generation with MT19937 differs from `numpy.random.randn()` and `diffusers.utils.randn_tensor`, so the Python and C++ versions are expected to produce different images because the latent images are initialized differently. Users can implement their own random generator derived from `ov::genai::Generator` and pass it to the `Text2ImagePipeline::generate` method.
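The LoRA section of the README above points to `lora.cpp` for the full sample; as a complement, here is a minimal sketch (not part of the shipped sample) of blending two adapters with different alpha coefficients, following the API usage shown in `lora.cpp`. The second adapter file name and the output image name are placeholders.

```cpp
// Minimal sketch: blend two LoRA adapters with different alpha coefficients.
// "soulcard.safetensors" is the adapter from the README example; the second
// adapter file and the output name are hypothetical placeholders.
#include "openvino/genai/text2image/pipeline.hpp"

#include "imwrite.hpp"

int main() {
    const std::string models_path = "dreamlike_anime_1_0_ov/FP16";
    const std::string device = "CPU";  // GPU can be used as well

    ov::genai::AdapterConfig adapter_config;
    adapter_config.add(ov::genai::Adapter("soulcard.safetensors"), 0.7f);
    adapter_config.add(ov::genai::Adapter("second_style.safetensors"), 0.3f);  // hypothetical second adapter

    // Adapters passed to the constructor are active by default in subsequent generate() calls
    ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config));

    ov::Tensor image = pipe.generate("curly-haired unicorn in the forest, anime, line",
                                     ov::genai::width(512),
                                     ov::genai::height(896),
                                     ov::genai::num_inference_steps(20));
    imwrite("blended_lora.bmp", image, true);
    return 0;
}
```

As `lora.cpp` demonstrates, adapters set in the constructor stay active for every `generate` call unless overridden by passing `ov::genai::adapters(...)` (or an empty `ov::genai::adapters()` to disable them) in the call itself.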
diff --git a/samples/cpp/text2image/baseline.bmp b/samples/cpp/text2image/baseline.bmp new file mode 100644 index 0000000000..aa9a51ccf6 Binary files /dev/null and b/samples/cpp/text2image/baseline.bmp differ diff --git a/samples/cpp/stable_diffusion/imwrite.cpp b/samples/cpp/text2image/imwrite.cpp similarity index 100% rename from samples/cpp/stable_diffusion/imwrite.cpp rename to samples/cpp/text2image/imwrite.cpp diff --git a/samples/cpp/stable_diffusion/imwrite.hpp b/samples/cpp/text2image/imwrite.hpp similarity index 100% rename from samples/cpp/stable_diffusion/imwrite.hpp rename to samples/cpp/text2image/imwrite.hpp diff --git a/samples/cpp/text2image/lora.bmp b/samples/cpp/text2image/lora.bmp new file mode 100644 index 0000000000..62859e4bdd Binary files /dev/null and b/samples/cpp/text2image/lora.bmp differ diff --git a/samples/cpp/text2image/lora.cpp b/samples/cpp/text2image/lora.cpp new file mode 100644 index 0000000000..0db7b55fe9 --- /dev/null +++ b/samples/cpp/text2image/lora.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/pipeline.hpp" + +#include "imwrite.hpp" + +int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " '' [ ...]]"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::AdapterConfig adapter_config; + // Multiple LoRA adapters applied simultaniously are supported, parse them all and corresponding alphas from cmd parameters: + for(size_t i = 0; i < (argc - 3)/2; ++i) { + ov::genai::Adapter adapter(argv[3 + 2*i]); + float alpha = std::atof(argv[3 + 2*i + 1]); + adapter_config.add(adapter, alpha); + } + + // LoRA adapters passed to the constructor will be activated by default in next generates + ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config)); + + std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; + ov::Tensor image = pipe.generate(prompt, + ov::genai::random_generator(std::make_shared(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("lora.bmp", image, true); + + std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; + image = pipe.generate(prompt, + ov::genai::adapters(), // passing adapters in generate overrides adapters set in the constructor; adapters() means no adapters + ov::genai::random_generator(std::make_shared(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("baseline.bmp", image, true); + + return EXIT_SUCCESS; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/text2image/main.cpp b/samples/cpp/text2image/main.cpp new file mode 100644 index 0000000000..02c632d53e --- /dev/null +++ b/samples/cpp/text2image/main.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/pipeline.hpp" + +#include "imwrite.hpp" + +namespace { + + void imwrite_output_imgs(const ov::Tensor& output) { + ov::Shape out_shape = output.get_shape(); + + if (out_shape[0] == 1) { + imwrite("image.bmp", output, true); + return; + } + + ov::Shape img_shape = {1, out_shape[1], out_shape[2], out_shape[3]}; + size_t img_size = output.get_size() / out_shape[0]; + + ov::Tensor image(output.get_element_type(), img_shape); + uint8_t* out_data = output.data(); + uint8_t* img_data = image.data(); + + for (int img_num = 0; img_num < out_shape[0]; ++img_num) { + std::memcpy(img_data, out_data + img_size * img_num, img_size * sizeof(uint8_t)); + + char img_name[25]; + sprintf(img_name, "image_%d.bmp", img_num); + + imwrite(img_name, image, true); + } + } + +} //namespace + +int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::Text2ImagePipeline pipe(models_path, device); + ov::Tensor image = pipe.generate(prompt, + ov::genai::width(512), + ov::genai::height(512), + ov::genai::num_inference_steps(20), + ov::genai::num_images_per_prompt(1)); + + imwrite_output_imgs(image); + + return EXIT_SUCCESS; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 0df2b5ab5c..9a1b21632f 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH ) file(DOWNLOAD @@ -14,11 +16,11 @@ file(DOWNLOAD add_executable(visual_language_chat visual_language_chat.cpp load_image.cpp) target_include_directories(visual_language_chat PRIVATE "${CMAKE_CURRENT_SOUCE_DIR}" "${CMAKE_BINARY_DIR}") target_link_libraries(visual_language_chat PRIVATE openvino::genai) + set_target_properties(visual_language_chat PROPERTIES - COMPILE_PDB_NAME chat_sample + COMPILE_PDB_NAME visual_language_chat # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(visual_language_chat PRIVATE cxx_std_11) install(TARGETS visual_language_chat RUNTIME DESTINATION samples_bin/ diff --git a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py b/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py index a08c3ad55b..7d2f0f1175 100644 --- a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py +++ b/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py @@ -9,22 +9,58 @@ from transformers import AutoModel, AutoTokenizer, AutoProcessor, TextIteratorStreamer from transformers.generation import GenerationMixin from transformers import AutoConfig, GenerationConfig -from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPooling +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask from pathlib import Path from huggingface_hub import snapshot_download import types -from typing import Optional, Tuple, List +from typing import Optional, Tuple, List, Union from openvino.runtime import opset13 import openvino as ov import openvino_tokenizers import numpy as np import gc +from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher +import time text_emb_path = Path("embed_tokens.xml") image_emb_path = Path("image_encoder.xml") resampler_path = Path("resampler.xml") llm_path = Path("language_model.xml") +class InsertSlice(MatcherPass): + def __init__(self): + MatcherPass.__init__(self) + self.model_changed = False + + param = WrapType("opset10.Result") + + def callback(matcher: Matcher) -> bool: + root = matcher.get_match_root() + if root is None: + return False + if len(root.get_output_partial_shape(0)) == 3: + parent = root.input_value(0).get_node() + grand_parent = parent.input_value(0).get_node() + + grand_parent_output = parent.input(0).get_source_output() + consumers = grand_parent_output.get_target_inputs() + start = np.array([0, -1, 0], dtype=np.int32) + stop = np.array([1, -2, grand_parent_output.get_partial_shape()[-1].get_length()], dtype=np.int32) + step = np.array([1, -1, 1], dtype=np.int32) + axes = np.array([0, 1, 2], dtype=np.int32) + slice = opset13.slice(grand_parent, start, stop, step, axes, name="inserted_slice") + for consumer in consumers: + consumer.replace_source_output(slice.output(0)) + self.model_changed = True + # Use new operation for additional matching + self.register_new_node(slice) + print("applied slice for lm head") + + return True + + self.register_matcher(Matcher(param, "InsertSlice"), callback) + def model_has_state(ov_model: ov.Model): return len(ov_model.get_sinks()) > 0 @@ -324,13 +360,151 @@ def convert_vision_encoder(model, model_dir): tgt_sizes = torch.tensor([[23, 45]]) if not (model_dir / image_emb_path).exists(): print("⌛ Convert Image embedding model") + def siglip_vis_embed_forward( + self, + pixel_values: torch.FloatTensor, + patch_attention_mask: torch.BoolTensor, + tgt_sizes: Optional[torch.IntTensor] = None, + 
position_ids: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + patch_embeds = self.patch_embedding(pixel_values) + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + if position_ids is None: + batch_size = pixel_values.size(0) + max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) + max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size + boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) + position_ids = torch.full( + size=( + batch_size, + max_nb_patches_h * max_nb_patches_w, + ), + fill_value=0, + ) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + if tgt_sizes is not None: + nb_patches_h = tgt_sizes[batch_idx][0] + nb_patches_w = tgt_sizes[batch_idx][1] + else: + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) + bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) + + pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + position_ids = position_ids.to(self.position_embedding.weight.device) + + embeddings = embeddings + self.position_embedding(position_ids) + return embeddings + + def siglip_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, is_causal=attention_mask is None + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None + + def siglip_transformer_forward( + self, + pixel_values, + patch_attention_mask: Optional[torch.BoolTensor] = None, + tgt_sizes: Optional[torch.IntTensor] = None, + position_ids: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size = pixel_values.size(0) + if patch_attention_mask is None: + patch_attention_mask = torch.ones( + size=( + batch_size, + pixel_values.size(2) // self.config.patch_size, + 
pixel_values.size(3) // self.config.patch_size, + ), + dtype=torch.bool, + device=pixel_values.device, + ) + + hidden_states = self.embeddings( + pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, tgt_sizes=tgt_sizes, position_ids=position_ids + ) + + patch_attention_mask = patch_attention_mask.view(batch_size, -1) + attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) if not self._use_flash_attention_2 else patch_attention_mask + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + if not return_dict: + return (last_hidden_state, None) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=None, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + vpm = model.vpm + vpm.embeddings.forward = types.MethodType(siglip_vis_embed_forward, vpm.embeddings) + for layer in vpm.encoder.layers: + layer.self_attn.forward = types.MethodType(siglip_attn_forward, layer.self_attn) + vpm.forward = types.MethodType(siglip_transformer_forward, vpm) + pixel_values = torch.randn([1, 3, 14, 14490]) patch_attn_mask = torch.zeros((1, 1, 1035), dtype=torch.bool) patch_attn_mask[0, 0, : tgt_sizes[0][0] * tgt_sizes[0][1]] = True - ov_model = ov.convert_model(model.vpm, example_input={"pixel_values": pixel_values, "tgt_sizes": tgt_sizes, "patch_attention_mask": patch_attn_mask}) + position_ids = prepare_vis_position_ids( + pixel_values, patch_attn_mask, tgt_sizes, model.config.vision_config.patch_size, model.config.vision_config.image_size // model.config.patch_size + ) + ov_model = ov.convert_model(vpm, example_input={"pixel_values": pixel_values, "position_ids": position_ids, "patch_attention_mask": patch_attn_mask}) ov.save_model(ov_model, model_dir / image_emb_path) del ov_model cleanup_torchscript_cache() + gc.collect() print("✅ Image embedding model successfully converted") if not (model_dir / resampler_path).exists(): @@ -343,7 +517,9 @@ def resampler_forward(self, x, pos_embed, key_padding_mask): q = self.ln_q(self.query) # Q * D - out = self.attn(self._repeat(q, bs), x + pos_embed, x, key_padding_mask=key_padding_mask)[0] # Q * B * D # L * B * D + L * B * D + q_bs = q.unsqueeze(1).repeat(1, bs, 1) + + out = self.attn(q_bs, x + pos_embed, x, key_padding_mask=key_padding_mask)[0] # Q * B * D # L * B * D + L * B * D # out: Q * B * D x = out.permute(1, 0, 2) # B * Q * D @@ -369,6 +545,8 @@ def resampler_forward(self, x, pos_embed, key_padding_mask): ov.save_model(ov_model, model_dir / resampler_path) del ov_model cleanup_torchscript_cache() + del model.resampler + gc.collect() print("✅ Resampler model successfully converted") @@ -380,11 +558,38 @@ def copy_llm_files(model_dir, dst_dir): shutil.copy(model_dir / llm_path.parent / "modeling_navit_siglip.py", model_dir / dst_dir / "modeling_navit_siglip.py") +def prepare_vis_position_ids(pixel_values, patch_attention_mask, tgt_sizes, patch_size, num_patches_per_side): + batch_size = pixel_values.size(0) + max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) + max_nb_patches_h, max_nb_patches_w = max_im_h // patch_size, max_im_w // patch_size + boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side) + position_ids 
= torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + if tgt_sizes is not None: + nb_patches_h = tgt_sizes[batch_idx][0] + nb_patches_w = tgt_sizes[batch_idx][1] + else: + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) + bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) + + pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + return position_ids + + core = ov.Core() class OvModelForCausalLMWithEmb(GenerationMixin): - def __init__(self, model_dir, device="CPU", ov_config=None, compile=True) -> None: + def __init__(self, model_dir, device="CPU", ov_config=None, compile=True, slice_lm_head=True) -> None: self._supports_cache_class = False self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) self.config.is_decoder = True @@ -393,6 +598,8 @@ def __init__(self, model_dir, device="CPU", ov_config=None, compile=True) -> Non model_dir = Path(model_dir) self.model = core.read_model(model_dir / "language_model.xml") self.token_emb = core.read_model(model_dir / "embed_tokens.xml") + if slice_lm_head: + self.slice_lm_head() self.request = None self.token_emb_request = None self._device = device.upper() @@ -402,9 +609,16 @@ def __init__(self, model_dir, device="CPU", ov_config=None, compile=True) -> Non self._past_length = None self.input_names = [input_t.get_any_name() for input_t in self.model.inputs] self.main_input_name = "input_ids" + self.llm_times = [] if compile: self.compile() + def slice_lm_head(self): + manager = Manager() + manager.register_pass(InsertSlice()) + manager.run_passes(self.model) + self.model.validate_nodes_and_infer_types() + def compile(self): if self.request is None: self.request = core.compile_model(self.model, self._device, self.ov_config).create_infer_request() @@ -446,6 +660,7 @@ def prepare_inputs( inputs = {} # past_key_values are not used explicitly, instead they are handled inside the model if past_key_values is None: + self.llm_times = [] # This is the first iteration in a sequence, reset all states if self.request is not None: self.request.reset_state() @@ -657,20 +872,39 @@ def get_vllm_embedding(self, data): for i in range(B): patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True - vision_batch_size = 1 + vision_batch_size = 32 all_pixel_values = all_pixel_values if B > vision_batch_size: hs = [] for i in range(0, B, vision_batch_size): start_idx = i end_idx = i + vision_batch_size - tmp_hs = torch.from_numpy( - self.vpm([all_pixel_values[start_idx:end_idx], patch_attn_mask[start_idx:end_idx], tgt_sizes[start_idx:end_idx]])[0] + block_pxl_values = all_pixel_values[start_idx:end_idx] + block_patch_attn_mask = patch_attn_mask[start_idx:end_idx] + block_tgt_sizes = tgt_sizes[start_idx:end_idx] + block_position_ids = prepare_vis_position_ids( + block_pxl_values, + block_patch_attn_mask, + block_tgt_sizes, + self.config.vision_config.patch_size, + self.config.vision_config.image_size // self.config.patch_size, ) + start = time.perf_counter() + tmp_hs = torch.from_numpy(self.vpm([block_pxl_values, block_patch_attn_mask, block_position_ids])[0]) + 
self.vpm_times.append(time.perf_counter() - start) hs.append(tmp_hs) vision_embedding = torch.cat(hs, dim=0) else: - vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, tgt_sizes])[0]) + position_ids = prepare_vis_position_ids( + all_pixel_values, + patch_attn_mask, + tgt_sizes, + self.config.vision_config.patch_size, + self.config.vision_config.image_size // self.config.patch_size, + ) + start = time.perf_counter() + vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, position_ids])[0]) + vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, position_ids])[0]) vision_embedding = self.resampler(vision_embedding, tgt_sizes) start = 0 @@ -801,6 +1035,8 @@ def chat( use_image_id=None, **kwargs, ): + self.vpm_times = [] + self.resampler_times = [] if isinstance(msgs[0], list): batched = True else: @@ -844,7 +1080,6 @@ def chat( copy_msgs = deepcopy(msgs) assert len(msgs) > 0, "msgs is empty" - assert sampling or not stream, "if use stream mode, make sure sampling=True" if image is not None and isinstance(copy_msgs[0]["content"], str): copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]] @@ -882,7 +1117,6 @@ def chat( generation_config = {"top_p": 0.8, "top_k": 100, "temperature": 0.7, "do_sample": True, "repetition_penalty": 1.05} else: generation_config = { - "num_beams": 3, "repetition_penalty": 1.2, } @@ -958,8 +1192,8 @@ def main(): gc.collect() convert_vision_encoder(model, model_dir) - ov_cpm = init_model(model_dir, "CPU") - print(ov_cpm.chat(Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw), [{"role": "user", "content": "What is unusual on this image?"}], ov_cpm.processor.tokenizer)) + # ov_cpm = init_model(model_dir, "CPU") + # print(ov_cpm.chat(Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw), [{"role": "user", "content": "What is unusual on this image?"}], ov_cpm.processor.tokenizer)) if "__main__" == __name__: main() diff --git a/samples/cpp/visual_language_chat/load_image.cpp b/samples/cpp/visual_language_chat/load_image.cpp index 85fe7e2fbe..855f7567bf 100644 --- a/samples/cpp/visual_language_chat/load_image.cpp +++ b/samples/cpp/visual_language_chat/load_image.cpp @@ -13,7 +13,7 @@ ov::Tensor utils::load_image(const std::filesystem::path& image_path) { image_path.string().c_str(), &x, &y, &channels_in_file, desired_channels); if (!image) { - throw std::runtime_error{"Failed to load the image"}; + throw std::runtime_error{"Failed to load the image."}; } struct SharedImageAllocator { unsigned char* image; @@ -22,11 +22,11 @@ ov::Tensor utils::load_image(const std::filesystem::path& image_path) { if (channels * height * width == bytes) { return image; } - throw std::runtime_error{"Unexpected number of bytes was requested to allocate"}; + throw std::runtime_error{"Unexpected number of bytes was requested to allocate."}; } void deallocate(void*, size_t bytes, size_t) { if (channels * height * width != bytes) { - throw std::runtime_error{"Unexpected number of bytes was requested to deallocate"}; + throw std::runtime_error{"Unexpected number of bytes was requested to deallocate."}; } std::free(image); image = nullptr; diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md index ab5a76c70a..fec5d9194f 100644 --- 
a/samples/cpp/whisper_speech_recognition/README.md
+++ b/samples/cpp/whisper_speech_recognition/README.md
@@ -23,7 +23,7 @@ Prepare audio file in wav format with sampling rate 16k Hz.
 
 Output: text transcription of `sample.wav`
 
-Models can be downloaded from [OpenAI HiggingFace](https://huggingface.co/openai).
+Models can be downloaded from [OpenAI HuggingFace](https://huggingface.co/openai).
 
 Supported Models:
 [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny)
diff --git a/samples/python/vlm_chat_sample/README.md b/samples/python/vlm_chat_sample/README.md
new file mode 100644
index 0000000000..246cbe3cd8
--- /dev/null
+++ b/samples/python/vlm_chat_sample/README.md
@@ -0,0 +1,38 @@
+# Python vlm_chat_sample that supports VLM models
+
+This example showcases inference of text-generation Vision Language Models (VLMs): `miniCPM-V-2_6` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.VLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/minicpm-v-multimodal-chatbot) which provides an example of a visual-language assistant.
+
+## Download and convert the model and tokenizers
+
+The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
+
+It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.
+
+```sh
+pip install --upgrade-strategy eager -r ../../requirements.txt
+```
+# TODO: add optimum cli command for miniCPM-V-2_6 when available
+
+## Run
+[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.
+
+`vlm_chat_sample.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg`
+
+
+Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. # TODO: examples of larger models
+Modify the source code to change the device for inference to the GPU (see the sketch below).
+
+See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
+
+### Troubleshooting
+
+#### Unicode characters encoding error on Windows
+
+Example error:
+```
+UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined>
+```
+
+If you encounter the error described in the example when the sample prints output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this:
+1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot.
+2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`.
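As the README above notes, targeting a discrete GPU is a small source change. The snippet below is a minimal sketch (not part of the sample) that mirrors `vlm_chat_sample.py` but runs a single non-interactive question on the GPU device; it assumes the model has already been exported to `./miniCPM-V-2_6/` and uses the sample image file name referenced above.

```python
#!/usr/bin/env python3
# Minimal sketch: same pipeline as vlm_chat_sample.py, but with the device switched to GPU
# and a single hard-coded question instead of the interactive loop.
import numpy as np
import openvino_genai
from PIL import Image
from openvino import Tensor


def streamer(subword: str) -> bool:
    # Print each generated sub-word as it arrives; returning None lets generation continue.
    print(subword, end='', flush=True)


pic = Image.open('319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg')  # sample image from the README
image = Tensor(np.array(pic.getdata()).reshape(1, 3, pic.size[1], pic.size[0]).astype(np.byte))

pipe = openvino_genai.VLMPipeline('./miniCPM-V-2_6/', 'GPU')  # 'CPU' in the original sample

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100

pipe.start_chat()
pipe('What is on the image?', image=image, generation_config=config, streamer=streamer)
pipe.finish_chat()
```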
diff --git a/samples/python/vlm_chat_sample/vlm_chat_sample.py b/samples/python/vlm_chat_sample/vlm_chat_sample.py new file mode 100644 index 0000000000..686fae939f --- /dev/null +++ b/samples/python/vlm_chat_sample/vlm_chat_sample.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse + +import numpy as np +import openvino_genai +from PIL import Image +from openvino import Tensor + + +def streamer(subword: str) -> bool: + ''' + + Args: + subword: sub-word of the generated text. + + Returns: Return flag corresponds whether generation should be stopped. + + ''' + print(subword, end='', flush=True) + + # No value is returned as in this example we don't want to stop the generation in this method. + # "return None" will be treated the same as "return False". + + +def read_image(path: str) -> Tensor: + ''' + + Args: + path: The path to the image. + + Returns: the ov.Tensor containing the image. + + ''' + pic = Image.open(path) + image_data = np.array(pic.getdata()).reshape(1, 3, pic.size[1], pic.size[0]).astype(np.byte) + return Tensor(image_data) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model_dir') + parser.add_argument('image_dir') + args = parser.parse_args() + + image = read_image(args.image_dir) + + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.VLMPipeline(args.model_dir, device) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + while True: + try: + prompt = input('question:\n') + except EOFError: + break + pipe(prompt, image=image, generation_config=config, streamer=streamer) + print('\n----------') + pipe.finish_chat() + + +if '__main__' == __name__: + main() diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md index ab5a76c70a..fec5d9194f 100644 --- a/samples/python/whisper_speech_recognition/README.md +++ b/samples/python/whisper_speech_recognition/README.md @@ -23,7 +23,7 @@ Prepare audio file in wav format with sampling rate 16k Hz. Output: text transcription of `sample.wav` -Models can be downloaded from [OpenAI HiggingFace](https://huggingface.co/openai). +Models can be downloaded from [OpenAI HuggingFace](https://huggingface.co/openai). Supported Models: [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) diff --git a/samples/requirements.txt b/samples/requirements.txt index b8cc30895e..4821d6dbef 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -3,4 +3,4 @@ optimum[openvino]==1.22.0 einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diffusers==0.30.3 -torchvision +torchvision # needed for mini-CPM export script. Need to remove when we switch to exporting with optimum-intel. 
diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index d1c38b2325..20b547052c 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -47,11 +47,13 @@ ov_genai_build_jinja2cpp() # Library -file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp", "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c") +file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c") set(TARGET_NAME openvino_genai) add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) -add_dependencies(${TARGET_NAME} openvino_tokenizers) +if(TARGET openvino_tokenizers) + add_dependencies(${TARGET_NAME} openvino_tokenizers) +endif() add_library(openvino::genai ALIAS ${TARGET_NAME}) target_include_directories(${TARGET_NAME} diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 5d6bce880f..a1244d3d75 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -161,8 +161,6 @@ static constexpr ov::Property presence_penalty{"presence_penalty"}; static constexpr ov::Property frequency_penalty{"frequency_penalty"}; static constexpr ov::Property rng_seed{"rng_seed"}; -static constexpr AdaptersProperty adapters; - // Predefined Configs OPENVINO_GENAI_EXPORTS GenerationConfig beam_search(); OPENVINO_GENAI_EXPORTS GenerationConfig greedy(); diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index a14dd1dde0..b21fb43bdb 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -270,7 +270,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { }; OPENVINO_GENAI_EXPORTS std::pair streamer(StreamerVariant func); -std::pair generation_config(const GenerationConfig& config); +OPENVINO_GENAI_EXPORTS std::pair generation_config(const GenerationConfig& config); } // namespace genai } // namespace ov diff --git a/src/cpp/include/openvino/genai/lora_adapter.hpp b/src/cpp/include/openvino/genai/lora_adapter.hpp index 388ccdb941..5748abb807 100644 --- a/src/cpp/include/openvino/genai/lora_adapter.hpp +++ b/src/cpp/include/openvino/genai/lora_adapter.hpp @@ -92,7 +92,9 @@ struct OPENVINO_GENAI_EXPORTS AdapterConfig { class AdaptersProperty : public ov::Property { public: - constexpr AdaptersProperty() : ov::Property("adapters") {} + inline constexpr static const char* name () { return "adapters"; } + + constexpr AdaptersProperty() : ov::Property(name()) {} inline std::pair operator()(const AdapterConfig& config) const { return ov::Property::operator()(config); @@ -154,6 +156,9 @@ class AdaptersProperty : public ov::Property { }; +static constexpr AdaptersProperty adapters; + + class OPENVINO_GENAI_EXPORTS AdapterController { std::shared_ptr m_pimpl; @@ -165,15 +170,12 @@ class OPENVINO_GENAI_EXPORTS AdapterController { AdapterController(std::shared_ptr model, const AdapterConfig& config, const std::string& prefix, std::string device = ""); - // Call it every time when adapter config is changed; if adapter is configured as a static one, this call is not required - void apply(ov::InferRequest& request, const AdapterConfig& config); + // Apply adapters configured in the current config set last time, or set and use new config given as optional `config` argument + void apply(ov::InferRequest& request, const std::optional& config = std::nullopt); // the next call of apply will set all adapter tensors regardless of config change, use this 
method if full state.reset is called for the controlled model void force_full_apply(bool full_apply = true); - // Apply the same config that was used last time (in initialization or in previous call to apply). - void apply(ov::InferRequest& request); - operator bool() const { return bool(m_pimpl); } diff --git a/src/cpp/include/openvino/genai/processor_config.hpp b/src/cpp/include/openvino/genai/processor_config.hpp index 9a70d1f3ae..bef6754e14 100644 --- a/src/cpp/include/openvino/genai/processor_config.hpp +++ b/src/cpp/include/openvino/genai/processor_config.hpp @@ -14,6 +14,7 @@ namespace ov::genai { /// preprocessor_config.json. class OPENVINO_GENAI_EXPORTS ProcessorConfig { public: + size_t image_size = 980; /// @brief Dimensions of the smaller, non-overlapping patches that the /// input image is divided into before being fed into the /// transformer model. Used to divide image height and width. diff --git a/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp b/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp index 4f348156c2..1f79b039d7 100644 --- a/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp +++ b/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp @@ -7,6 +7,7 @@ #include "openvino/genai/visibility.hpp" #include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/lora_adapter.hpp" #include "openvino/core/any.hpp" #include "openvino/runtime/tensor.hpp" @@ -21,6 +22,7 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel { struct Config { size_t max_position_embeddings = 77; size_t hidden_size = 512; + size_t num_hidden_layers = 13; explicit Config(const std::string& config_path); }; @@ -53,10 +55,15 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel { return compile(device, ov::AnyMap{std::forward(properties)...}); } + void set_adapters(const AdapterConfig& adapters); + ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance); + ov::Tensor get_output_tensor(const size_t idx); + private: Config m_config; + AdapterController m_adapter_controller; ov::InferRequest m_request; std::shared_ptr m_model; diff --git a/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp new file mode 100644 index 0000000000..e46e76f316 --- /dev/null +++ b/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "openvino/genai/visibility.hpp" +#include "openvino/genai/tokenizer.hpp" + +#include "openvino/core/any.hpp" +#include "openvino/runtime/tensor.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/properties.hpp" + +namespace ov { +namespace genai { + +class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection { +public: + struct Config { + size_t max_position_embeddings = 77; + size_t hidden_size = 512; + size_t num_hidden_layers = 33; + + explicit Config(const std::string& config_path); + }; + + explicit CLIPTextModelWithProjection(const std::string root_dir); + + CLIPTextModelWithProjection(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties = {}); + + template ::value, bool>::type = true> + CLIPTextModelWithProjection(const std::string& root_dir, + const std::string& device, + Properties&&... 
properties) + : CLIPTextModelWithProjection(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } + + CLIPTextModelWithProjection(const CLIPTextModelWithProjection&); + + const Config& get_config() const; + + CLIPTextModelWithProjection& reshape(int batch_size); + + CLIPTextModelWithProjection& compile(const std::string& device, const ov::AnyMap& properties = {}); + + template + ov::util::EnableIfAllStringAny compile( + const std::string& device, + Properties&&... properties) { + return compile(device, ov::AnyMap{std::forward(properties)...}); + } + + ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance); + + ov::Tensor get_output_tensor(const size_t idx); + +private: + Config m_config; + ov::InferRequest m_request; + std::shared_ptr m_model; + + Tokenizer m_clip_tokenizer; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/text2image/pipeline.hpp b/src/cpp/include/openvino/genai/text2image/pipeline.hpp index 952e775f2d..5ce6a08b11 100644 --- a/src/cpp/include/openvino/genai/text2image/pipeline.hpp +++ b/src/cpp/include/openvino/genai/text2image/pipeline.hpp @@ -13,7 +13,9 @@ #include "openvino/genai/visibility.hpp" +#include "openvino/genai/lora_adapter.hpp" #include "openvino/genai/text2image/clip_text_model.hpp" +#include "openvino/genai/text2image/clip_text_model_with_projection.hpp" #include "openvino/genai/text2image/unet2d_condition_model.hpp" #include "openvino/genai/text2image/autoencoder_kl.hpp" @@ -53,7 +55,8 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { AUTO, LCM, LMS_DISCRETE, - DDIM + DDIM, + EULER_DISCRETE }; static std::shared_ptr from_config(const std::string& scheduler_config_path, @@ -81,6 +84,8 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { int64_t width = -1; size_t num_inference_steps = 50; + AdapterConfig adapters; + void update_generation_config(const ov::AnyMap& config_map); // checks whether is config is valid @@ -96,6 +101,13 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { Text2ImagePipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties = {}); + template ::value, bool>::type = true> + Text2ImagePipeline(const std::string& root_dir, + const std::string& device, + Properties&&... 
properties) + : Text2ImagePipeline(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } + // creates either LCM or SD pipeline from building blocks static Text2ImagePipeline stable_diffusion( const std::shared_ptr& scheduler_type, @@ -110,6 +122,14 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { const UNet2DConditionModel& unet, const AutoencoderKL& vae_decoder); + // creates SDXL pipeline from building blocks + static Text2ImagePipeline stable_diffusion_xl( + const std::shared_ptr& scheduler_type, + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder); + GenerationConfig get_generation_config() const; void set_generation_config(const GenerationConfig& generation_config); @@ -138,6 +158,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { explicit Text2ImagePipeline(const std::shared_ptr& impl); class StableDiffusionPipeline; + class StableDiffusionXLPipeline; }; // diff --git a/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp b/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp index b5b5288049..b3cfe1d364 100644 --- a/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp +++ b/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp @@ -14,6 +14,7 @@ #include "openvino/runtime/tensor.hpp" #include "openvino/runtime/infer_request.hpp" #include "openvino/runtime/properties.hpp" +#include "openvino/genai/lora_adapter.hpp" namespace ov { namespace genai { @@ -61,10 +62,13 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel { void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states); + void set_adapters(const AdapterConfig& adapters); + ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep); private: Config m_config; + AdapterController m_adapter_controller; std::shared_ptr m_model; ov::InferRequest m_request; size_t m_vae_scale_factor; diff --git a/src/cpp/src/clip.hpp b/src/cpp/src/clip.hpp index c8965a4890..99c06a05d2 100644 --- a/src/cpp/src/clip.hpp +++ b/src/cpp/src/clip.hpp @@ -25,6 +25,8 @@ struct clip_ctx { std::vector buf_compute_meta; projector_type proj_type = PROJECTOR_TYPE_RESAMPLER; + size_t patch_size = 0; + size_t image_size = 0; }; // RGB uint8 image diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 94a05dd587..ff7ceb051e 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -16,6 +16,7 @@ #include "utils.hpp" #include "text_callback_streamer.hpp" #include "openvino/genai/lora_adapter.hpp" +#include "lora_helper.hpp" namespace ov { namespace genai { @@ -76,12 +77,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(model_path)) { ov::Core core; - auto adapters_iter = plugin_config.find(ov::genai::adapters.name()); - if (adapters_iter != plugin_config.end()) { - m_generation_config.adapters = adapters_iter->second.as(); - auto filtered_plugin_config = plugin_config; - filtered_plugin_config.erase(ov::genai::adapters.name()); - auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(filtered_plugin_config); + if(auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { + auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(*filtered_plugin_config); 
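+        // `adapters` has been moved into m_generation_config.adapters and removed from the property map;
+        // the remaining options were split above into core-level and compile-time parts.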
core.set_property(core_plugin_config); auto model = core.read_model(model_path / "openvino_model.xml"); m_adapter_controller = AdapterController(model, m_generation_config.adapters, "base_model.model.model.", device); // TODO: Make the prefix name configurable diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 0c75ad30b4..e330693c5d 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -52,6 +52,36 @@ void align_u4_zp_constants(const std::shared_ptr& model) { } } +bool allow_to_enable_npuw_dq(const std::shared_ptr& model) { + std::vector rt_info_path = {"nncf", "weight_compression", "group_size"}; + if (!model->has_rt_info(rt_info_path)) { + // NB: Model isn't compressed by NNCF - skip + return false; + } + auto group_size = model->get_rt_info(rt_info_path); + if (group_size == -1) { + // NB: Enable DQ for CW quantized models + return true; + } + return false; +} + +std::optional pop_option(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + config.erase(it); + return std::make_optional(it->second); + } + return std::nullopt; +} + +void enable_npuw_dq_if_allowed(ov::AnyMap& config, + const std::shared_ptr& model) { + if (allow_to_enable_npuw_dq(model)) { + config["NPUW_DQ"] = "YES"; + pop_option(config, "NPUW_ONLINE_AVOID"); + } +} + std::shared_ptr redirect_new_kv_to_output(const std::shared_ptr& model) { const auto kStartOutputKVCacheLayers = 1u; for (int i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) { @@ -182,19 +212,22 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) { } } -ov::AnyMap get_default_prefill_config() { - std::map config = { +ov::AnyMap get_default_prefill_config(const std::shared_ptr& model) { + ov::AnyMap config = { { "NPU_USE_NPUW", "YES" }, { "NPUW_FOLD", "YES" }, { "NPUW_DCOFF_TYPE", "f16" }, { "NPUW_DCOFF_SCALE", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add" }, { "NPUW_ONLINE_AVOID", "P:RMSNorm/NPU" } }; - return { config.begin(), config.end() }; + enable_npuw_dq_if_allowed(config, model); + return config; } -ov::AnyMap get_default_generate_config() { - std::map config = { +ov::AnyMap get_default_generate_config(const std::shared_ptr& model) { + ov::AnyMap config = { { "NPU_USE_NPUW", "YES" }, { "NPUW_FOLD", "YES" }, { "NPUW_DCOFF_TYPE", "f16" }, @@ -202,17 +235,18 @@ ov::AnyMap get_default_generate_config() { { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add" }, { "NPUW_PARALLEL_COMPILE", "YES" }, { "NPUW_FUNCALL_ASYNC", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, { "NPUW_ONLINE_AVOID", "P:RMSNorm/NPU" } }; - return { config.begin(), config.end() }; + enable_npuw_dq_if_allowed(config, model); + return config; } template T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) { - if (auto it = config.find(key); it != config.end()) { - auto value = it->second; - config.erase(it); - return value.as(); + auto anyopt = pop_option(config, key); + if (anyopt.has_value()) { + return anyopt.value().as(); } return default_value; } @@ -227,9 +261,7 @@ ov::Tensor make_tensor_slice(ov::Tensor tensor, size_t dim, size_t start_pos, si void drop_cache_dir(ov::AnyMap& config) { if (config.count("NPU_USE_NPUW") != 0u) { - if (auto it = config.find("CACHE_DIR"); it != config.end()) { - config.erase(it); - } + pop_option(config, 
"CACHE_DIR"); } } @@ -312,13 +344,18 @@ void StaticLLMPipeline::setupAndCompileModels( reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); // (8) Compile both model - auto prefill_config = pop_or_default(pipeline_config, "PREFILL_CONFIG", get_default_prefill_config()); - auto generate_config = pop_or_default(pipeline_config, "GENERATE_CONFIG", get_default_generate_config()); + auto prefill_config = pop_or_default( + pipeline_config, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model) + ); + auto generate_config = pop_or_default( + pipeline_config, "GENERATE_CONFIG", get_default_generate_config(m_kvcache_model) + ); merge_config_with(prefill_config, pipeline_config); merge_config_with(generate_config, pipeline_config); // FIXME: Drop CACHE_DIR option if NPUW is enabled drop_cache_dir(prefill_config); drop_cache_dir(generate_config); + m_prefill_request = core.compile_model( m_prefill_model, device, prefill_config ).create_infer_request(); @@ -342,7 +379,7 @@ void StaticLLMPipeline::setupAndImportModels( */ ov::Core core; - auto import_blob = [this, + auto import_blob = [this, &path, &pipeline_config, &core, @@ -397,8 +434,8 @@ void StaticLLMPipeline::setupAndImportModels( // (4) Fill in m_kvcache_desc const uint32_t kMaxPromptLen = get_kvcache_size(prefill_model); const uint32_t kMinResponseLen = get_kvcache_size(generate_model) - kMaxPromptLen; - // FIXME For some models KV-cache dim != 2u - m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, 2u }; + // FIXME For some models KV-cache dim != 2u + m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, 2u }; } void StaticLLMPipeline::start_chat(const std::string& system_message) { diff --git a/src/cpp/src/lora_adapter.cpp b/src/cpp/src/lora_adapter.cpp index 216fca98a4..2bfd1d5ca1 100644 --- a/src/cpp/src/lora_adapter.cpp +++ b/src/cpp/src/lora_adapter.cpp @@ -491,7 +491,7 @@ class LoRATransformBase : public ov::pass::MatcherPass { // Builds LoRA subgraph that consists of several matrix and element-wise multiplications with optional data type conversions and reshapes // to build a consistent graph. 
-NodePtr tensors_multiplication(NodePtr input, const NodeVector multipliers, ov::Output target, bool transpose_weights, size_t alpha_pos) { +NodePtr tensors_multiplication(NodePtr input, const NodeVector multipliers, ov::Output target, bool transpose_weights, size_t alpha_pos, bool transpose_in_end) { const auto target_type = target.get_element_type(); const auto target_shape = target.get_partial_shape(); const auto target_rank = target_shape.rank().get_length(); @@ -516,7 +516,7 @@ NodePtr tensors_multiplication(NodePtr input, const NodeVector multipliers, ov:: } } - if(target_rank == 4 && target_shape[-1].is_static() && target_shape[-1].get_length() > 1) { // FIXME: Check all potentially permuted dimensions, not only the last one + if(transpose_in_end) { // FIXME: Check the dimensions we really need to move, currently it is hardcoded 2 + 2 dimensions that usually appears in 2D Convolution case // where we need to apply LoRA for the first two dimensions (channels) while interpreting two last dimensions (spatial ) // TODO: Stash transposition constant to reuse @@ -648,7 +648,7 @@ class LoRAFuseTransform : public LoRATransformBase { for(auto multiplier : adapter) { parameters.push_back(std::make_shared(multiplier->get_output_element_type(0), multiplier->get_output_partial_shape(0))); } - auto result = std::make_shared(tensors_multiplication(nullptr, NodeVector{parameters.begin() + 1, parameters.end()}, target, false, 1)); + auto result = std::make_shared(tensors_multiplication(nullptr, NodeVector{parameters.begin() + 1, parameters.end()}, target, false, 1, false)); auto weights_model = std::make_shared(ov::ResultVector{result}, parameters); fusers.insert(signature, weights_model); } @@ -699,6 +699,7 @@ class LoRASeparateTransform : public LoRATransformBase { auto target_rank = target.get_partial_shape().rank().get_length(); auto consumers = target.get_target_inputs(); + bool transpose_in_end = false; // FIXME: Should check rank of activations instead of target rank if(target_rank == 4 && target.get_partial_shape()[target_rank - 3].get_length() > 1) { @@ -707,10 +708,11 @@ class LoRASeparateTransform : public LoRATransformBase { auto transposition = v0::Constant::create(ov::element::i32, ov::Shape{4}, std::vector{2, 3, 0, 1}); auto transpose = register_new_node(activations, transposition); activations = transpose; + transpose_in_end = true; } NodeVector lora_variables{lora_weight.A, lora_weight.alpha, lora_weight.B}; - replacement = tensors_multiplication(activations.get_node_shared_ptr(), lora_variables, target, true, 1); + replacement = tensors_multiplication(activations.get_node_shared_ptr(), lora_variables, target, true, 1, transpose_in_end); for (auto consumer : consumers) { consumer.replace_source_output(replacement->output(0)); @@ -843,7 +845,7 @@ struct AdapterControllerImpl { } struct ConfigChanged { - bool mode; + bool mode = false; bool alpha = false; bool adapter = false; @@ -872,25 +874,28 @@ struct AdapterControllerImpl { return diff; } - void apply (ov::InferRequest& infer_request, const AdapterConfig& config) { + void apply (ov::InferRequest& infer_request, std::optional config) { // FIXME: If a part of LoRA state tensors are not set here, then need to carefully reset state in LLMPipeline where global reset is called after the generation - - const auto diff = compare_configs(current_config, config); - OPENVINO_ASSERT( - !diff.mode || config.get_mode() == AdapterConfig::MODE_AUTO, // MODE_AUTO in this call means that mode is not changed - "AdapterConfig::mode cannot be changed 
and should be configured once for a model at the initialization"); - OPENVINO_ASSERT( - config.get_mode() == AdapterConfig::MODE_AUTO || config.get_mode() == AdapterConfig::MODE_DYNAMIC || config.get_mode() == AdapterConfig::MODE_STATIC_RANK || (!diff.alpha && !diff.adapter), - "Cannot change adapters and/or the alphas when not one of the dynamic modes are used."); + ConfigChanged diff; + if(config) { + diff = compare_configs(current_config, *config); + OPENVINO_ASSERT( + !diff.mode || config->get_mode() == AdapterConfig::MODE_AUTO, // MODE_AUTO in this call means that mode is not changed + "AdapterConfig::mode cannot be changed and should be configured once for a model at the initialization"); + OPENVINO_ASSERT( + config->get_mode() == AdapterConfig::MODE_AUTO || config->get_mode() == AdapterConfig::MODE_DYNAMIC || config->get_mode() == AdapterConfig::MODE_STATIC_RANK || (!diff.alpha && !diff.adapter), + "Cannot change adapters and/or the alphas when not one of the dynamic modes are used."); + current_config = *config; + } if(need_full_apply) { need_full_apply = false; - set_new_adapter_tensors(infer_request, config); + set_new_adapter_tensors(infer_request); } else if(diff) { if(diff.adapter) { - set_new_adapter_tensors(infer_request, config); + set_new_adapter_tensors(infer_request); } else { OPENVINO_ASSERT(diff.alpha); - set_new_adapter_alphas(infer_request, config); + set_new_adapter_alphas(infer_request); } } } @@ -899,13 +904,12 @@ struct AdapterControllerImpl { need_full_apply = full_apply; } - void set_new_adapter_alphas (ov::InferRequest& infer_request, const AdapterConfig& config) { + void set_new_adapter_alphas (ov::InferRequest& infer_request) { // FIXME: Provide more economical way to update only alphas - set_new_adapter_tensors(infer_request, config); + set_new_adapter_tensors(infer_request); } - void set_new_adapter_tensors (ov::InferRequest& infer_request, const AdapterConfig& config) { - current_config = config; // FIXME: Keep the old config to map to cached LoRA state tensors instead of the current approach where we start from scratch each time + void set_new_adapter_tensors (ov::InferRequest& infer_request) { if(current_config.get_mode() != AdapterConfig::MODE_AUTO && current_config.get_mode() != AdapterConfig::MODE_DYNAMIC && current_config.get_mode() != AdapterConfig::MODE_STATIC_RANK ) { return; } @@ -1163,10 +1167,6 @@ struct AdapterControllerImpl { } return new_tensors; } - - void apply (ov::InferRequest& infer_request) { - return apply(infer_request, current_config); - } }; @@ -1207,13 +1207,13 @@ AdapterController::AdapterController(std::shared_ptr model, const Ada // Call it every time when adapter config is changed; if adapter was configured as a static one, this call is not required -void AdapterController::apply(ov::InferRequest& request, const AdapterConfig& config) { - return m_pimpl->apply(request, config); -} - - -void AdapterController::apply(ov::InferRequest& request){ - return m_pimpl->apply(request); +void AdapterController::apply(ov::InferRequest& request, const std::optional& config) { + OPENVINO_ASSERT(m_pimpl || !config || !*config, + "Adapters are passed to AdapterController but it was not configured to use adapters. 
" + "Enable using adapters by pass them in the constructor first."); + if (m_pimpl) { + m_pimpl->apply(request, config); + } } diff --git a/src/cpp/src/lora_helper.cpp b/src/cpp/src/lora_helper.cpp new file mode 100644 index 0000000000..7e7a6e613c --- /dev/null +++ b/src/cpp/src/lora_helper.cpp @@ -0,0 +1,28 @@ +#include "lora_helper.hpp" + + +namespace ov { +namespace genai { + +std::optional extract_adapters_from_properties (const AnyMap& properties, AdapterConfig* adapter_config) { + auto adapters_iter = properties.find(AdaptersProperty::name()); + if (adapters_iter != properties.end()) { + if(adapter_config) { + *adapter_config = adapters_iter->second.as(); + } + auto filtered_properties = properties; + filtered_properties.erase(AdaptersProperty::name()); + return filtered_properties; + } + return std::nullopt; +} + +void update_adapters_from_properties (const AnyMap& properties, AdapterConfig& adapter_config) { + auto adapters_iter = properties.find(AdaptersProperty::name()); + if (adapters_iter != properties.end()) { + adapter_config = adapters_iter->second.as(); + } +} + +} +} \ No newline at end of file diff --git a/src/cpp/src/lora_helper.hpp b/src/cpp/src/lora_helper.hpp new file mode 100644 index 0000000000..b9e41e8b4c --- /dev/null +++ b/src/cpp/src/lora_helper.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "openvino/genai/lora_adapter.hpp" + + +namespace ov { +namespace genai { + +// Search for `adapters` property in `properties` map. If it is found and `adapter_config` is not nullptr, +// set `adapter_config` with found value, and return a copy of `properties` with the `adapters` property removed. +// If there is no `adapters` property, `adapter_config` is left unchanged and std::nullopt is returned. +std::optional extract_adapters_from_properties (const AnyMap& properties, AdapterConfig* adapter_config = nullptr); + +// Search for `adapters` property in `properties` map. If it is found, set `adapter_config` with found value. +// If `adapters` property is not found, do nothing. 
+void update_adapters_from_properties (const AnyMap& properties, AdapterConfig& adapter_config); + +} +} \ No newline at end of file diff --git a/src/cpp/src/text2image/diffusion_pipeline.hpp b/src/cpp/src/text2image/diffusion_pipeline.hpp index 41dce0e030..1884df4ca6 100644 --- a/src/cpp/src/text2image/diffusion_pipeline.hpp +++ b/src/cpp/src/text2image/diffusion_pipeline.hpp @@ -1,6 +1,8 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include #include "text2image/schedulers/ischeduler.hpp" diff --git a/src/cpp/src/text2image/models/autoencoder_kl.cpp b/src/cpp/src/text2image/models/autoencoder_kl.cpp index e9aec9528f..30b378963b 100644 --- a/src/cpp/src/text2image/models/autoencoder_kl.cpp +++ b/src/cpp/src/text2image/models/autoencoder_kl.cpp @@ -14,6 +14,7 @@ #include "openvino/op/constant.hpp" #include "utils.hpp" +#include "lora_helper.hpp" namespace ov { namespace genai { @@ -43,7 +44,11 @@ AutoencoderKL::AutoencoderKL(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) : AutoencoderKL(root_dir) { - compile(device, properties); + if(auto filtered_properties = extract_adapters_from_properties(properties)) { + compile(device, *filtered_properties); + } else { + compile(device, properties); + } } AutoencoderKL::AutoencoderKL(const AutoencoderKL&) = default; diff --git a/src/cpp/src/text2image/models/clip_text_model.cpp b/src/cpp/src/text2image/models/clip_text_model.cpp index d42a07f9c1..b8ec871eb0 100644 --- a/src/cpp/src/text2image/models/clip_text_model.cpp +++ b/src/cpp/src/text2image/models/clip_text_model.cpp @@ -8,6 +8,7 @@ #include "openvino/runtime/core.hpp" #include "utils.hpp" +#include "lora_helper.hpp" namespace ov { namespace genai { @@ -21,6 +22,7 @@ CLIPTextModel::Config::Config(const std::string& config_path) { read_json_param(data, "max_position_embeddings", max_position_embeddings); read_json_param(data, "hidden_size", hidden_size); + read_json_param(data, "num_hidden_layers", num_hidden_layers); } CLIPTextModel::CLIPTextModel(const std::string root_dir) : @@ -33,7 +35,13 @@ CLIPTextModel::CLIPTextModel(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) : CLIPTextModel(root_dir) { - compile(device, properties); + AdapterConfig adapters; + if(auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) { + m_adapter_controller = AdapterController(m_model, adapters, "lora_te", device); + compile(device, *filtered_properties); + } else { + compile(device, properties); + } } CLIPTextModel::CLIPTextModel(const CLIPTextModel&) = default; @@ -64,6 +72,10 @@ CLIPTextModel& CLIPTextModel::compile(const std::string& device, const ov::AnyMa return *this; } +void CLIPTextModel::set_adapters(const AdapterConfig& adapters) { + m_adapter_controller.apply(m_request, adapters); +} + ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance) { OPENVINO_ASSERT(m_request, "CLIP text encoder model must be compiled first. 
Cannot infer non-compiled model"); @@ -100,5 +112,9 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string return m_request.get_output_tensor(0); } +ov::Tensor CLIPTextModel::get_output_tensor(const size_t idx) { + return m_request.get_output_tensor(idx); +} + } // namespace genai } // namespace ov diff --git a/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp b/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp new file mode 100644 index 0000000000..2fa7b83738 --- /dev/null +++ b/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp @@ -0,0 +1,109 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/clip_text_model_with_projection.hpp" + +#include + +#include "openvino/runtime/core.hpp" + +#include "utils.hpp" + +namespace ov { +namespace genai { + +CLIPTextModelWithProjection::Config::Config(const std::string& config_path) { + std::ifstream file(config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "max_position_embeddings", max_position_embeddings); + read_json_param(data, "hidden_size", hidden_size); + read_json_param(data, "num_hidden_layers", num_hidden_layers); +} + +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string root_dir) : + m_clip_tokenizer(root_dir + "/../tokenizer_2"), + m_config(root_dir + "/config.json") { + m_model = ov::Core().read_model(root_dir + "/openvino_model.xml"); +} + +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties) : + CLIPTextModelWithProjection(root_dir) { + compile(device, properties); +} + +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const CLIPTextModelWithProjection&) = default; + +const CLIPTextModelWithProjection::Config& CLIPTextModelWithProjection::get_config() const { + return m_config; +} + +CLIPTextModelWithProjection& CLIPTextModelWithProjection::reshape(int batch_size) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot reshape already compiled model"); + + ov::PartialShape input_shape = m_model->input(0).get_partial_shape(); + input_shape[0] = batch_size; + input_shape[1] = m_config.max_position_embeddings; + std::map idx_to_shape{{0, input_shape}}; + m_model->reshape(idx_to_shape); + + return *this; +} + +CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::string& device, const ov::AnyMap& properties) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model"); + ov::CompiledModel compiled_model = ov::Core().compile_model(m_model, device, properties); + m_request = compiled_model.create_infer_request(); + // release the original model + m_model.reset(); + + return *this; +} + +ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance) { + OPENVINO_ASSERT(m_request, "CLIP text encoder model must be compiled first. Cannot infer non-compiled model"); + + const int32_t pad_token_id = m_clip_tokenizer.get_pad_token_id(); + const size_t text_embedding_batch_size = do_classifier_free_guidance ? 
2 : 1; + + auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) { + std::fill_n(input_ids.data(), input_ids.get_size(), pad_token_id); + + ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids; + std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); + }; + + ov::Tensor input_ids(ov::element::i64, {text_embedding_batch_size, m_config.max_position_embeddings}); + size_t current_batch_idx = 0; + + if (do_classifier_free_guidance) { + perform_tokenization(neg_prompt, + ov::Tensor(input_ids, {current_batch_idx , 0}, + {current_batch_idx + 1, m_config.max_position_embeddings})); + ++current_batch_idx; + } else { + // Negative prompt is ignored when --guidanceScale < 1.0 + } + + perform_tokenization(pos_prompt, + ov::Tensor(input_ids, {current_batch_idx , 0}, + {current_batch_idx + 1, m_config.max_position_embeddings})); + + // text embeddings + m_request.set_tensor("input_ids", input_ids); + m_request.infer(); + + return m_request.get_output_tensor(0); +} + +ov::Tensor CLIPTextModelWithProjection::get_output_tensor(const size_t idx) { + return m_request.get_output_tensor(idx); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/models/unet2d_condition_model.cpp b/src/cpp/src/text2image/models/unet2d_condition_model.cpp index 69563fb7c4..d356515678 100644 --- a/src/cpp/src/text2image/models/unet2d_condition_model.cpp +++ b/src/cpp/src/text2image/models/unet2d_condition_model.cpp @@ -8,6 +8,7 @@ #include "openvino/runtime/core.hpp" #include "utils.hpp" +#include "lora_helper.hpp" namespace ov { namespace genai { @@ -36,7 +37,13 @@ UNet2DConditionModel::UNet2DConditionModel(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) : UNet2DConditionModel(root_dir) { - compile(device, properties); + AdapterConfig adapters; + if(auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) { + m_adapter_controller = AdapterController(m_model, adapters, "lora_unet", device); + compile(device, *filtered_properties); + } else { + compile(device, properties); + } } UNet2DConditionModel::UNet2DConditionModel(const UNet2DConditionModel&) = default; @@ -64,7 +71,7 @@ UNet2DConditionModel& UNet2DConditionModel::reshape(int batch_size, int height, name_to_shape[input_name][0] = 1; } else if (input_name == "sample") { name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width}; - } else if (input_name == "time_ids") { + } else if (input_name == "time_ids" || input_name == "text_embeds") { name_to_shape[input_name][0] = batch_size; } else if (input_name == "encoder_hidden_states") { name_to_shape[input_name][0] = batch_size; @@ -92,6 +99,10 @@ void UNet2DConditionModel::set_hidden_states(const std::string& tensor_name, ov: m_request.set_tensor(tensor_name, encoder_hidden_states); } +void UNet2DConditionModel::set_adapters(const AdapterConfig& adapters) { + m_adapter_controller.apply(m_request, adapters); +} + ov::Tensor UNet2DConditionModel::infer(ov::Tensor sample, ov::Tensor timestep) { OPENVINO_ASSERT(m_request, "UNet model must be compiled first. 
Cannot infer non-compiled model"); diff --git a/src/cpp/src/text2image/numpy_utils.cpp b/src/cpp/src/text2image/numpy_utils.cpp new file mode 100644 index 0000000000..9554681820 --- /dev/null +++ b/src/cpp/src/text2image/numpy_utils.cpp @@ -0,0 +1,79 @@ +#include "text2image/numpy_utils.hpp" +#include "openvino/core/except.hpp" + +namespace ov { +namespace genai { +namespace numpy_utils { + +void rescale_zero_terminal_snr(std::vector& betas) { + // Convert betas to alphas_bar_sqrt + std::vector alphas, alphas_bar_sqrt; + for (float b : betas) { + alphas.push_back(1.0f - b); + } + + for (size_t i = 1; i <= alphas.size(); ++i) { + float alpha_cumprod = + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies{}); + alphas_bar_sqrt.push_back(std::sqrt(alpha_cumprod)); + } + + float alphas_bar_sqrt_0 = alphas_bar_sqrt[0]; + float alphas_bar_sqrt_T = alphas_bar_sqrt[alphas_bar_sqrt.size() - 1]; + + for (float& x : alphas_bar_sqrt) { + // Shift so the last timestep is zero. + x = x - alphas_bar_sqrt_T; + // Scale so the first timestep is back to the old value. + x *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T); + // Revert sqrt + x = std::pow(x, 2); + } + + // Revert cumprod + std::vector end = alphas_bar_sqrt, begin = alphas_bar_sqrt; + end.erase(end.begin()); + begin.pop_back(); + + alphas[0] = alphas_bar_sqrt[0]; + for (size_t i = 1; i < alphas.size(); ++i) { + alphas[i] = end[i - 1] / begin[i - 1]; + } + + std::transform(alphas.begin(), alphas.end(), betas.begin(), [](float x) { + return (1 - x); + }); +} + +std::vector interp(const std::vector& x, const std::vector& xp, const std::vector& fp) { + OPENVINO_ASSERT(xp.size() == fp.size(), "`xp` and `fp`vectors must have the same sizes"); + + std::vector interp_res; + + for (const auto& i : x) { + if (i <= xp[0]) { + interp_res.push_back(fp[0]); + } else if (i >= xp[xp.size() - 1]) { + interp_res.push_back(fp[fp.size() - 1]); + } else { + // Find the first xp element that is not less than x[i] + auto it = std::lower_bound(xp.begin(), xp.end(), i); + + // idx of the left boundary + size_t idx = std::distance(xp.begin(), it) - 1; + + float x0 = xp[idx], x1 = xp[idx + 1]; + float y0 = fp[idx], y1 = fp[idx + 1]; + + float interp_val = (y1 - y0) / (x1 - x0) * (i - x0) + y0; + + interp_res.push_back(interp_val); + } + } + + return interp_res; +} + +} // namespace ov +} // namespace genai +} // namespace numpy_utils diff --git a/src/cpp/src/text2image/numpy_utils.hpp b/src/cpp/src/text2image/numpy_utils.hpp index 4520d35ae8..d6144eeb99 100644 --- a/src/cpp/src/text2image/numpy_utils.hpp +++ b/src/cpp/src/text2image/numpy_utils.hpp @@ -4,6 +4,11 @@ #pragma once #include +#include +#include +#include +#include +#include namespace ov { namespace genai { @@ -31,6 +36,12 @@ std::vector linspace(U start, U end, size_t num, bool endpoint = false) { return indices; } -}// namespace ov -}// namespace genai -}// namespace txt2img_utils +// Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) +void rescale_zero_terminal_snr(std::vector& betas); + +// np.interp(...) 
implementation +std::vector interp(const std::vector& x, const std::vector& xp, const std::vector& fp); + +} // namespace ov +} // namespace genai +} // namespace numpy_utils diff --git a/src/cpp/src/text2image/schedulers/ddim.cpp b/src/cpp/src/text2image/schedulers/ddim.cpp index 470a8c8ec0..a25cf7227e 100644 --- a/src/cpp/src/text2image/schedulers/ddim.cpp +++ b/src/cpp/src/text2image/schedulers/ddim.cpp @@ -62,6 +62,7 @@ DDIMScheduler::DDIMScheduler(const Config& scheduler_config) } if (m_config.rescale_betas_zero_snr) { + using numpy_utils::rescale_zero_terminal_snr; rescale_zero_terminal_snr(betas); } @@ -157,7 +158,7 @@ std::map DDIMScheduler::step(ov::Tensor noise_pred, ov: break; default: OPENVINO_THROW("Unsupported value for 'PredictionType'"); - } + } } // TODO: support m_config.thresholding @@ -197,45 +198,5 @@ void DDIMScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) return; } -void DDIMScheduler::rescale_zero_terminal_snr(std::vector& betas) { - // Convert betas to alphas_bar_sqrt - std::vector alphas, alphas_bar_sqrt; - for (float b : betas) { - alphas.push_back(1.0f - b); - } - - for (size_t i = 1; i <= alphas.size(); ++i) { - float alpha_cumprod = - std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies{}); - alphas_bar_sqrt.push_back(std::sqrt(alpha_cumprod)); - } - - float alphas_bar_sqrt_0 = alphas_bar_sqrt[0]; - float alphas_bar_sqrt_T = alphas_bar_sqrt[alphas_bar_sqrt.size() - 1]; - - for (float& x : alphas_bar_sqrt) { - // Shift so the last timestep is zero. - x = x - alphas_bar_sqrt_T; - // Scale so the first timestep is back to the old value. - x *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T); - // Revert sqrt - x = std::pow(x, 2); - } - - // Revert cumprod - std::vector end = alphas_bar_sqrt, begin = alphas_bar_sqrt; - end.erase(end.begin()); - begin.pop_back(); - - alphas[0] = alphas_bar_sqrt[0]; - for (size_t i = 1; i < alphas.size(); ++i) { - alphas[i] = end[i - 1] / begin[i - 1]; - } - - std::transform(alphas.begin(), alphas.end(), betas.begin(), [](float x) { - return (1 - x); - }); -} - } // namespace genai } // namespace ov diff --git a/src/cpp/src/text2image/schedulers/ddim.hpp b/src/cpp/src/text2image/schedulers/ddim.hpp index 062fc14ce5..936f4991ea 100644 --- a/src/cpp/src/text2image/schedulers/ddim.hpp +++ b/src/cpp/src/text2image/schedulers/ddim.hpp @@ -52,8 +52,6 @@ class DDIMScheduler : public IScheduler { size_t m_num_inference_steps; std::vector m_timesteps; - - void rescale_zero_terminal_snr(std::vector& betas); }; } // namespace genai diff --git a/src/cpp/src/text2image/schedulers/euler_discrete.cpp b/src/cpp/src/text2image/schedulers/euler_discrete.cpp new file mode 100644 index 0000000000..9873a3998f --- /dev/null +++ b/src/cpp/src/text2image/schedulers/euler_discrete.cpp @@ -0,0 +1,281 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text2image/schedulers/euler_discrete.hpp" + +#include +#include +#include +#include + +#include "text2image/numpy_utils.hpp" +#include "utils.hpp" + +namespace ov { +namespace genai { + +EulerDiscreteScheduler::Config::Config(const std::string& scheduler_config_path) { + std::ifstream file(scheduler_config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "num_train_timesteps", num_train_timesteps); + read_json_param(data, "beta_start", beta_start); + 
read_json_param(data, "beta_end", beta_end); + read_json_param(data, "beta_schedule", beta_schedule); + read_json_param(data, "trained_betas", trained_betas); + read_json_param(data, "final_sigmas_type", final_sigmas_type); + read_json_param(data, "interpolation_type", interpolation_type); + read_json_param(data, "sigma_max", sigma_max); + read_json_param(data, "sigma_min", sigma_min); + read_json_param(data, "steps_offset", steps_offset); + read_json_param(data, "prediction_type", prediction_type); + read_json_param(data, "timestep_spacing", timestep_spacing); + read_json_param(data, "timestep_type", timestep_type); + read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); + read_json_param(data, "use_karras_sigmas", use_karras_sigmas); + read_json_param(data, "use_exponential_sigmas", use_exponential_sigmas); + read_json_param(data, "use_beta_sigmas", use_beta_sigmas); +} + +EulerDiscreteScheduler::EulerDiscreteScheduler(const std::string scheduler_config_path) + : EulerDiscreteScheduler(Config(scheduler_config_path)) {} + +EulerDiscreteScheduler::EulerDiscreteScheduler(const Config& scheduler_config) : m_config(scheduler_config) { + std::vector alphas, betas; + + using numpy_utils::linspace; + + if (!m_config.trained_betas.empty()) { + betas = m_config.trained_betas; + } else if (m_config.beta_schedule == BetaSchedule::LINEAR) { + betas = linspace(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps); + } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) { + float start = std::sqrt(m_config.beta_start); + float end = std::sqrt(m_config.beta_end); + betas = linspace(start, end, m_config.num_train_timesteps); + std::for_each(betas.begin(), betas.end(), [](float& x) { + x *= x; + }); + } else { + OPENVINO_THROW( + "'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. Please, add support of other types"); + } + + if (m_config.rescale_betas_zero_snr) { + using numpy_utils::rescale_zero_terminal_snr; + rescale_zero_terminal_snr(betas); + } + + std::transform(betas.begin(), betas.end(), std::back_inserter(alphas), [](float b) { + return 1.0f - b; + }); + + for (size_t i = 1; i <= alphas.size(); ++i) { + float alpha_cumprod = + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies{}); + m_alphas_cumprod.push_back(alpha_cumprod); + } + + if (m_config.rescale_betas_zero_snr) { + m_alphas_cumprod.back() = std::pow(2, -24); + } + + for (auto it = m_alphas_cumprod.rbegin(); it != m_alphas_cumprod.rend(); ++it) { + float sigma = std::pow(((1 - (*it)) / (*it)), 0.5); + m_sigmas.push_back(sigma); + } + + auto linspaced = + linspace(0.0f, static_cast(m_config.num_train_timesteps - 1), m_config.num_train_timesteps, true); + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { + m_timesteps.push_back(static_cast(std::round(*it))); + } + + OPENVINO_ASSERT( + m_config.timestep_type != TimestepType::CONTINUOUS || m_config.prediction_type != PredictionType::V_PREDICTION, + "This case isn't supported: `timestep_type=continuous` and `prediction_type=v_prediction`. 
Please, add " + "support."); + + m_sigmas.push_back(0); +} + +void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps) { + // TODO: support `timesteps` and `sigmas` inputs + m_timesteps.clear(); + m_sigmas.clear(); + + m_num_inference_steps = num_inference_steps; + std::vector sigmas; + + OPENVINO_ASSERT( + m_config.timestep_type != TimestepType::CONTINUOUS || m_config.prediction_type != PredictionType::V_PREDICTION, + "This case isn't supported: `timestep_type=continuous` and `prediction_type=v_prediction`. Please, add " + "support."); + + switch (m_config.timestep_spacing) { + case TimestepSpacing::LINSPACE: { + using numpy_utils::linspace; + float end = static_cast(m_config.num_train_timesteps - 1); + auto linspaced = linspace(0.0f, end, num_inference_steps, true); + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { + m_timesteps.push_back(static_cast(std::round(*it))); + } + break; + } + case TimestepSpacing::LEADING: { + size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps; + for (size_t i = num_inference_steps - 1; i != -1; --i) { + m_timesteps.push_back(i * step_ratio + m_config.steps_offset); + } + break; + } + case TimestepSpacing::TRAILING: { + float step_ratio = static_cast(m_config.num_train_timesteps) / static_cast(m_num_inference_steps); + for (float i = m_config.num_train_timesteps; i > 0; i -= step_ratio) { + m_timesteps.push_back(static_cast(std::round(i)) - 1); + } + break; + } + default: + OPENVINO_THROW("Unsupported value for 'timestep_spacing'"); + } + + for (const float& i : m_alphas_cumprod) { + float sigma = std::pow(((1 - i) / i), 0.5); + sigmas.push_back(sigma); + } + + switch (m_config.interpolation_type) { + case InterpolationType::LINEAR: { + using numpy_utils::interp; + + std::vector x_data_points(sigmas.size()); + std::iota(x_data_points.begin(), x_data_points.end(), 0); + m_sigmas = interp(m_timesteps, x_data_points, sigmas); + break; + } + case InterpolationType::LOG_LINEAR: { + using numpy_utils::linspace; + + m_sigmas = linspace(std::log(sigmas.back()), std::log(sigmas[0]), num_inference_steps + 1, true); + std::transform(m_sigmas.begin(), m_sigmas.end(), m_sigmas.begin(), [](float x) { + return std::exp(x); + }); + break; + } + default: + OPENVINO_THROW("Unsupported value for 'interpolation_type'"); + } + + OPENVINO_ASSERT(!m_config.use_karras_sigmas, + "Parameter 'use_karras_sigmas' is not supported. Please, add support."); + + OPENVINO_ASSERT(!m_config.use_exponential_sigmas, + "Parameter 'use_exponential_sigmas' is not supported. Please, add support."); + + OPENVINO_ASSERT(!m_config.use_beta_sigmas, "Parameter 'use_beta_sigmas' is not supported. 
Please, add support."); + + float sigma_last = 0; + switch (m_config.final_sigmas_type) { + case FinalSigmaType::SIGMA_MIN: + sigma_last = std::pow(((1 - m_alphas_cumprod[0]) / m_alphas_cumprod[0]), 0.5); + break; + case FinalSigmaType::ZERO: + break; + default: + OPENVINO_THROW("Unsupported value for 'final_sigmas_type'"); + } + m_sigmas.push_back(sigma_last); +} + +std::map EulerDiscreteScheduler::step(ov::Tensor noise_pred, + ov::Tensor latents, + size_t inference_step) { + // noise_pred - model_output + // latents - sample + // inference_step + + size_t timestep = get_timesteps()[inference_step]; + + if (m_step_index == -1) + m_step_index = 0; + + float sigma = m_sigmas[m_step_index]; + // TODO: hardcoded gamma + float gamma = 0.0f; + float sigma_hat = sigma * (gamma + 1); + + float* model_output_data = noise_pred.data(); + float* sample_data = latents.data(); + + ov::Tensor pred_original_sample(noise_pred.get_element_type(), noise_pred.get_shape()); + float* pred_original_sample_data = pred_original_sample.data(); + + ov::Tensor prev_sample(noise_pred.get_element_type(), noise_pred.get_shape()); + float* prev_sample_data = prev_sample.data(); + + // 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + switch (m_config.prediction_type) { + case PredictionType::EPSILON: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = sample_data[i] - model_output_data[i] * sigma_hat; + } + break; + case PredictionType::SAMPLE: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = model_output_data[i]; + } + break; + case PredictionType::V_PREDICTION: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = model_output_data[i] * (-sigma / std::pow((std::pow(sigma, 2) + 1), 0.5)) + + (sample_data[i] / (std::pow(sigma, 2) + 1)); + } + break; + default: + OPENVINO_THROW("Unsupported value for 'PredictionType'"); + } + + float dt = m_sigmas[m_step_index + 1] - sigma_hat; + + // 2. 
Convert to an ODE derivative + for (size_t i = 0; i < prev_sample.get_size(); ++i) { + prev_sample_data[i] = ((sample_data[i] - pred_original_sample_data[i]) / sigma_hat) * dt + sample_data[i]; + } + + m_step_index += 1; + + return {{"latent", prev_sample}, {"denoised", pred_original_sample}}; +} + +std::vector EulerDiscreteScheduler::get_timesteps() const { + return m_timesteps; +} + +float EulerDiscreteScheduler::get_init_noise_sigma() const { + float max_sigma = *std::max_element(m_sigmas.begin(), m_sigmas.end()); + + if (m_config.timestep_spacing == TimestepSpacing::LINSPACE || + m_config.timestep_spacing == TimestepSpacing::TRAILING) { + return max_sigma; + } + + return std::sqrt(max_sigma * max_sigma + 1); +} + +void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { + if (m_step_index == -1) + m_step_index = 0; + + float sigma = m_sigmas[m_step_index]; + float* sample_data = sample.data(); + for (size_t i = 0; i < sample.get_size(); i++) { + sample_data[i] /= std::pow((std::pow(sigma, 2) + 1), 0.5); + } +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/euler_discrete.hpp b/src/cpp/src/text2image/schedulers/euler_discrete.hpp new file mode 100644 index 0000000000..1dc60f118f --- /dev/null +++ b/src/cpp/src/text2image/schedulers/euler_discrete.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include "text2image/schedulers/types.hpp" +#include "text2image/schedulers/ischeduler.hpp" + +namespace ov { +namespace genai { + +class EulerDiscreteScheduler : public IScheduler { +public: + struct Config { + int32_t num_train_timesteps = 1000; + float beta_start = 0.0001f, beta_end = 0.02f; + BetaSchedule beta_schedule = BetaSchedule::SCALED_LINEAR; + std::vector trained_betas = {}; + FinalSigmaType final_sigmas_type = FinalSigmaType::ZERO; + InterpolationType interpolation_type = InterpolationType::LINEAR; + float sigma_max = 0.0f, sigma_min = 0.0f; + size_t steps_offset = 0; + PredictionType prediction_type = PredictionType::EPSILON; + TimestepSpacing timestep_spacing = TimestepSpacing::LEADING; + TimestepType timestep_type = TimestepType::DISCRETE; + bool rescale_betas_zero_snr = false; + bool use_karras_sigmas = false, use_exponential_sigmas = false, use_beta_sigmas = false; + + Config() = default; + explicit Config(const std::string& scheduler_config_path); + }; + + explicit EulerDiscreteScheduler(const std::string scheduler_config_path); + explicit EulerDiscreteScheduler(const Config& scheduler_config); + + void set_timesteps(size_t num_inference_steps) override; + + std::vector get_timesteps() const override; + + float get_init_noise_sigma() const override; + + void scale_model_input(ov::Tensor sample, size_t inference_step) override; + + std::map step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) override; + +private: + Config m_config; + + std::vector m_alphas_cumprod, m_sigmas; + std::vector m_timesteps; + size_t m_num_inference_steps; + + size_t m_step_index = -1; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/scheduler.cpp b/src/cpp/src/text2image/schedulers/scheduler.cpp index cf14640b7c..44b08d67fc 100644 --- a/src/cpp/src/text2image/schedulers/scheduler.cpp +++ b/src/cpp/src/text2image/schedulers/scheduler.cpp @@ -10,6 +10,7 @@ #include "text2image/schedulers/lcm.hpp" #include "text2image/schedulers/lms_discrete.hpp" #include 
"text2image/schedulers/ddim.hpp" +#include "text2image/schedulers/euler_discrete.hpp" namespace ov { namespace genai { @@ -35,6 +36,8 @@ std::shared_ptr Text2ImagePipeline::Scheduler::fr scheduler = std::make_shared(scheduler_config_path); } else if (scheduler_type == Scheduler::Type::DDIM) { scheduler = std::make_shared(scheduler_config_path); + } else if (scheduler_type == Scheduler::Type::EULER_DISCRETE) { + scheduler = std::make_shared(scheduler_config_path); } else { OPENVINO_THROW("Unsupported scheduler type '", scheduler_type, ". Please, manually create scheduler via supported one"); } diff --git a/src/cpp/src/text2image/schedulers/types.cpp b/src/cpp/src/text2image/schedulers/types.cpp index 4ecdcea811..0ca970f359 100644 --- a/src/cpp/src/text2image/schedulers/types.cpp +++ b/src/cpp/src/text2image/schedulers/types.cpp @@ -49,6 +49,8 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Text2I param = Text2ImagePipeline::Scheduler::DDIM; else if (scheduler_type_str == "LMSDiscreteScheduler") param = Text2ImagePipeline::Scheduler::LMS_DISCRETE; + else if (scheduler_type_str == "EulerDiscreteScheduler") + param = Text2ImagePipeline::Scheduler::EULER_DISCRETE; else if (!scheduler_type_str.empty()) { OPENVINO_THROW("Unsupported value for 'prediction_type' ", scheduler_type_str); } @@ -71,6 +73,48 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Timest } } +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, InterpolationType& param) { + if (data.contains(name) && data[name].is_string()) { + std::string interpolation_type = data[name].get(); + if (interpolation_type == "linear") + param = InterpolationType::LINEAR; + else if (interpolation_type == "log_linear") + param = InterpolationType::LOG_LINEAR; + else if (!interpolation_type.empty()) { + OPENVINO_THROW("Unsupported value for 'interpolation_type' ", interpolation_type); + } + } +} + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, FinalSigmaType& param) { + if (data.contains(name) && data[name].is_string()) { + std::string final_sigma_type = data[name].get(); + if (final_sigma_type == "zero") + param = FinalSigmaType::ZERO; + else if (final_sigma_type == "sigma_min") + param = FinalSigmaType::SIGMA_MIN; + else if (!final_sigma_type.empty()) { + OPENVINO_THROW("Unsupported value for 'final_sigma_type' ", final_sigma_type); + } + } +} + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, TimestepType& param) { + if (data.contains(name) && data[name].is_string()) { + std::string timestep_type = data[name].get(); + if (timestep_type == "discrete") + param = TimestepType::DISCRETE; + else if (timestep_type == "continuous") + param = TimestepType::CONTINUOUS; + else if (!timestep_type.empty()) { + OPENVINO_THROW("Unsupported value for 'timestep_type' ", timestep_type); + } + } +} + } // namespace utils } // namespace genai } // namespace ov @@ -83,6 +127,8 @@ std::ostream& operator<<(std::ostream& os, const ov::genai::Text2ImagePipeline:: return os << "LMSDiscreteScheduler"; case ov::genai::Text2ImagePipeline::Scheduler::Type::DDIM: return os << "DDIMScheduler"; + case ov::genai::Text2ImagePipeline::Scheduler::Type::EULER_DISCRETE: + return os << "EulerDiscreteScheduler"; case ov::genai::Text2ImagePipeline::Scheduler::Type::AUTO: return os << "AutoScheduler"; default: diff --git a/src/cpp/src/text2image/schedulers/types.hpp b/src/cpp/src/text2image/schedulers/types.hpp index 
3029998f95..74fde4f993 100644 --- a/src/cpp/src/text2image/schedulers/types.hpp +++ b/src/cpp/src/text2image/schedulers/types.hpp @@ -30,6 +30,21 @@ enum class TimestepSpacing { LEADING }; +enum class InterpolationType { + LINEAR, + LOG_LINEAR +}; + +enum class FinalSigmaType { + ZERO, + SIGMA_MIN +}; + +enum class TimestepType { + DISCRETE, + CONTINUOUS +}; + namespace utils { template <> @@ -44,6 +59,15 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Text2I template <> void read_json_param(const nlohmann::json& data, const std::string& name, TimestepSpacing& param); +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, InterpolationType& param); + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, FinalSigmaType& param); + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, TimestepType& param); + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/text2image/stable_diffusion_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_pipeline.hpp index 84251cb21f..54d2d43c19 100644 --- a/src/cpp/src/text2image/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/text2image/stable_diffusion_pipeline.hpp @@ -7,6 +7,7 @@ #include #include "utils.hpp" +#include "lora_helper.hpp" namespace ov { namespace genai { @@ -106,6 +107,8 @@ class Text2ImagePipeline::StableDiffusionPipeline : public Text2ImagePipeline::D // initialize generation config initialize_generation_config(data["_class_name"].get()); + + update_adapters_from_properties(properties, m_generation_config.adapters); } StableDiffusionPipeline( @@ -149,6 +152,9 @@ class Text2ImagePipeline::StableDiffusionPipeline : public Text2ImagePipeline::D generation_config.width = unet_config.sample_size * vae_scale_factor; check_inputs(generation_config.height, generation_config.width); + m_clip_text_encoder->set_adapters(generation_config.adapters); + m_unet->set_adapters(generation_config.adapters); + if (generation_config.random_generator == nullptr) { uint32_t seed = time(NULL); generation_config.random_generator = std::make_shared(seed); diff --git a/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp new file mode 100644 index 0000000000..95ea2abc5d --- /dev/null +++ b/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp @@ -0,0 +1,345 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text2image/diffusion_pipeline.hpp" + +#include +#include + +#include "utils.hpp" + +namespace ov { +namespace genai { + +class Text2ImagePipeline::StableDiffusionXLPipeline : public Text2ImagePipeline::DiffusionPipeline { +public: + explicit StableDiffusionXLPipeline(const std::string& root_dir) { + const std::string model_index_path = root_dir + "/model_index.json"; + std::ifstream file(model_index_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + + const std::string text_encoder = data["text_encoder"][1].get(); + if (text_encoder == "CLIPTextModel") { + m_clip_text_encoder = std::make_shared(root_dir + "/text_encoder"); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string text_encoder_2 = data["text_encoder_2"][1].get(); + if 
(text_encoder_2 == "CLIPTextModelWithProjection") { + m_clip_text_encoder_with_projection = std::make_shared(root_dir + "/text_encoder_2"); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string unet = data["unet"][1].get(); + if (unet == "UNet2DConditionModel") { + m_unet = std::make_shared(root_dir + "/unet"); + } else { + OPENVINO_THROW("Unsupported '", unet, "' UNet type"); + } + + const std::string vae = data["vae"][1].get(); + if (vae == "AutoencoderKL") { + m_vae_decoder = std::make_shared(root_dir + "/vae_decoder"); + } else { + OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + } + + // initialize generation config + initialize_generation_config(data["_class_name"].get()); + } + + StableDiffusionXLPipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) { + const std::string model_index_path = root_dir + "/model_index.json"; + std::ifstream file(model_index_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + + const std::string text_encoder = data["text_encoder"][1].get(); + if (text_encoder == "CLIPTextModel") { + m_clip_text_encoder = std::make_shared(root_dir + "/text_encoder", device, properties); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string text_encoder_2 = data["text_encoder_2"][1].get(); + if (text_encoder_2 == "CLIPTextModelWithProjection") { + m_clip_text_encoder_with_projection = std::make_shared(root_dir + "/text_encoder_2", device, properties); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string unet = data["unet"][1].get(); + if (unet == "UNet2DConditionModel") { + m_unet = std::make_shared(root_dir + "/unet", device, properties); + } else { + OPENVINO_THROW("Unsupported '", unet, "' UNet type"); + } + + const std::string vae = data["vae"][1].get(); + if (vae == "AutoencoderKL") { + m_vae_decoder = std::make_shared(root_dir + "/vae_decoder", device, properties); + } else { + OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + } + + // initialize generation config + initialize_generation_config(data["_class_name"].get()); + } + + StableDiffusionXLPipeline( + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder) + : m_clip_text_encoder(std::make_shared(clip_text_model)), + m_clip_text_encoder_with_projection(std::make_shared(clip_text_model_with_projection)), + m_unet(std::make_shared(unet)), + m_vae_decoder(std::make_shared(vae_decoder)) { } + + void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) override { + check_inputs(height, width); + + const size_t batch_size_multiplier = do_classifier_free_guidance(guidance_scale) ? 
2 : 1; // Unet accepts 2x batch in case of CFG + m_clip_text_encoder->reshape(batch_size_multiplier); + m_clip_text_encoder_with_projection->reshape(batch_size_multiplier); + m_unet->reshape(num_images_per_prompt * batch_size_multiplier, height, width, m_clip_text_encoder->get_config().max_position_embeddings); + m_vae_decoder->reshape(num_images_per_prompt, height, width); + } + + void compile(const std::string& device, const ov::AnyMap& properties) override { + m_clip_text_encoder->compile(device, properties); + m_clip_text_encoder_with_projection->compile(device, properties); + m_unet->compile(device, properties); + m_vae_decoder->compile(device, properties); + } + + ov::Tensor generate(const std::string& positive_prompt, + const ov::AnyMap& properties) override { + GenerationConfig generation_config = m_generation_config; + generation_config.update_generation_config(properties); + + // Stable Diffusion pipeline + // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline + + const auto& unet_config = m_unet->get_config(); + const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Unet accepts 2x batch in case of CFG + const size_t vae_scale_factor = m_unet->get_vae_scale_factor(); + + if (generation_config.height < 0) + generation_config.height = unet_config.sample_size * vae_scale_factor; + if (generation_config.width < 0) + generation_config.width = unet_config.sample_size * vae_scale_factor; + check_inputs(generation_config.height, generation_config.width); + + if (generation_config.random_generator == nullptr) { + uint32_t seed = time(NULL); + generation_config.random_generator = std::make_shared(seed); + } + + std::vector time_ids = {static_cast(generation_config.width), + static_cast(generation_config.height), + 0, + 0, + static_cast(generation_config.width), + static_cast(generation_config.height), + }; + ov::Tensor add_time_ids(ov::element::f32, {batch_size_multiplier, time_ids.size()}); + float* add_time_ids_data = add_time_ids.data(); + std::copy(time_ids.begin(), time_ids.end(), add_time_ids_data); + + if (batch_size_multiplier > 1) { + std::copy(time_ids.begin(), time_ids.end(), add_time_ids_data + time_ids.size()); + } + + ov::Tensor add_text_embeds = m_clip_text_encoder_with_projection->infer(positive_prompt, generation_config.negative_prompt, batch_size_multiplier > 1); + m_clip_text_encoder->infer(positive_prompt, generation_config.negative_prompt, batch_size_multiplier > 1); + + // prompt_embeds = prompt_embeds.hidden_states[-2] + size_t idx_hidden_state_1 = m_clip_text_encoder->get_config().num_hidden_layers; + ov::Tensor encoder_hidden_states_1 = m_clip_text_encoder->get_output_tensor(idx_hidden_state_1); + size_t idx_hidden_state_2 = m_clip_text_encoder_with_projection->get_config().num_hidden_layers; + ov::Tensor encoder_hidden_states_2 = m_clip_text_encoder_with_projection->get_output_tensor(idx_hidden_state_2); + + ov::Shape ehs_1_shape = encoder_hidden_states_1.get_shape(); + ov::Shape ehs_2_shape = encoder_hidden_states_2.get_shape(); + + OPENVINO_ASSERT(ehs_1_shape[0] == ehs_2_shape[0] && ehs_1_shape[1] == ehs_2_shape[1], + "Tensors for concatenation must have the same dimensions"); + + // concatenate hidden_states from two encoders + ov::Shape encoder_hidden_states_shape = {ehs_1_shape[0], ehs_1_shape[1], ehs_1_shape[2] + ehs_2_shape[2]}; + ov::Tensor encoder_hidden_states(encoder_hidden_states_1.get_element_type(), 
encoder_hidden_states_shape); + + const float* ehs_1_data = encoder_hidden_states_1.data(); + const float* ehs_2_data = encoder_hidden_states_2.data(); + float* encoder_hidden_states_data = encoder_hidden_states.data(); + + for (size_t i = 0; i < ehs_1_shape[0]; ++i) { + for (size_t j = 0; j < ehs_1_shape[1]; ++j) { + size_t offset_1 = (i * ehs_1_shape[1] + j) * ehs_1_shape[2]; + size_t offset_2 = (i * ehs_2_shape[1] + j) * ehs_2_shape[2]; + + size_t step = (i * ehs_1_shape[1] + j) * (ehs_1_shape[2] + ehs_2_shape[2]); + + std::memcpy(encoder_hidden_states_data + step, ehs_1_data + offset_1, ehs_1_shape[2] * sizeof(float)); + std::memcpy(encoder_hidden_states_data + step + ehs_1_shape[2], ehs_2_data + offset_2, ehs_2_shape[2] * sizeof(float)); + } + } + + // replicate encoder hidden state to UNet model + if (generation_config.num_images_per_prompt == 1) { + // reuse output of text encoder directly w/o extra memory copy + m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states); + m_unet->set_hidden_states("text_embeds", add_text_embeds); + m_unet->set_hidden_states("time_ids", add_time_ids); + + } else { + ov::Shape enc_shape = encoder_hidden_states.get_shape(); + enc_shape[0] *= generation_config.num_images_per_prompt; + + ov::Tensor encoder_hidden_states_repeated(encoder_hidden_states.get_element_type(), enc_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, 0, n); + if (batch_size_multiplier > 1) { + batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states_repeated); + + ov::Shape t_emb_shape = add_text_embeds.get_shape(); + t_emb_shape[0] *= generation_config.num_images_per_prompt; + + ov::Tensor add_text_embeds_repeated(add_text_embeds.get_element_type(), t_emb_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(add_text_embeds, add_text_embeds_repeated, 0, n); + if (batch_size_multiplier > 1) { + batch_copy(add_text_embeds, add_text_embeds_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("text_embeds", add_text_embeds_repeated); + + ov::Shape t_ids_shape = add_time_ids.get_shape(); + t_ids_shape[0] *= generation_config.num_images_per_prompt; + ov::Tensor add_time_ids_repeated(add_time_ids.get_element_type(), t_ids_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(add_time_ids, add_time_ids_repeated, 0, n); + if (batch_size_multiplier > 1) { + batch_copy(add_time_ids, add_time_ids_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("time_ids", add_time_ids_repeated); + } + + m_scheduler->set_timesteps(generation_config.num_inference_steps); + std::vector timesteps = m_scheduler->get_timesteps(); + + // latents are multiplied by 'init_noise_sigma' + ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels, + generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; + ov::Shape latent_shape_cfg = latent_shape; + latent_shape_cfg[0] *= batch_size_multiplier; + + ov::Tensor latent(ov::element::f32, latent_shape), latent_cfg(ov::element::f32, latent_shape_cfg); + std::generate_n(latent.data(), latent.get_size(), [&]() -> float { + return generation_config.random_generator->next() * 
m_scheduler->get_init_noise_sigma(); + }); + + ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {}); + for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; inference_step++) { + // concat the same latent twice along a batch dimension in case of CFG + if (batch_size_multiplier > 1) { + batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); + batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt); + } else { + // just assign to save memory copy + latent_cfg = latent; + } + + m_scheduler->scale_model_input(latent_cfg, inference_step); + + ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); + ov::Tensor noise_pred_tensor = m_unet->infer(latent_cfg, timestep); + + ov::Shape noise_pred_shape = noise_pred_tensor.get_shape(); + noise_pred_shape[0] /= batch_size_multiplier; + noisy_residual_tensor.set_shape(noise_pred_shape); + + if (batch_size_multiplier > 1) { + // perform guidance + float* noisy_residual = noisy_residual_tensor.data(); + const float* noise_pred_uncond = noise_pred_tensor.data(); + const float* noise_pred_text = noise_pred_uncond + noisy_residual_tensor.get_size(); + + for (size_t i = 0; i < noisy_residual_tensor.get_size(); ++i) { + noisy_residual[i] = noise_pred_uncond[i] + + generation_config.guidance_scale * (noise_pred_text[i] - noise_pred_uncond[i]); + } + } else { + noisy_residual_tensor = noise_pred_tensor; + } + + auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step); + latent = scheduler_step_result["latent"]; + + // check whether scheduler returns "denoised" image, which should be passed to VAE decoder + const auto it = scheduler_step_result.find("denoised"); + denoised = it != scheduler_step_result.end() ? it->second : latent; + } + + return m_vae_decoder->infer(denoised); + } + +private: + bool do_classifier_free_guidance(float guidance_scale) const { + return guidance_scale > 1.0 && m_unet->get_config().time_cond_proj_dim < 0; + } + + void initialize_generation_config(const std::string& class_name) override { + assert(m_unet != nullptr); + const auto& unet_config = m_unet->get_config(); + const size_t vae_scale_factor = m_unet->get_vae_scale_factor(); + + m_generation_config.height = unet_config.sample_size * vae_scale_factor; + m_generation_config.width = unet_config.sample_size * vae_scale_factor; + + if (class_name == "StableDiffusionXLPipeline") { + m_generation_config.guidance_scale = 5.0f; + m_generation_config.num_inference_steps = 50; + } else { + OPENVINO_THROW("Unsupported class_name '", class_name, "'. 
Please, contact OpenVINO GenAI developers"); + } + } + + void check_inputs(const int height, const int width) const override { + assert(m_unet != nullptr); + const size_t vae_scale_factor = m_unet->get_vae_scale_factor(); + OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) && + (width % vae_scale_factor == 0 || width < 0), "Both 'width' and 'height' must be divisible by", + vae_scale_factor); + } + + std::shared_ptr m_clip_text_encoder; + std::shared_ptr m_clip_text_encoder_with_projection; + std::shared_ptr m_unet; + std::shared_ptr m_vae_decoder; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/text2image_pipeline.cpp b/src/cpp/src/text2image/text2image_pipeline.cpp index b8a8e1898b..f7a6ab65ae 100644 --- a/src/cpp/src/text2image/text2image_pipeline.cpp +++ b/src/cpp/src/text2image/text2image_pipeline.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "text2image/stable_diffusion_pipeline.hpp" +#include "text2image/stable_diffusion_xl_pipeline.hpp" #include #include @@ -44,7 +45,8 @@ void Text2ImagePipeline::GenerationConfig::update_generation_config(const ov::An read_anymap_param(properties, "height", height); read_anymap_param(properties, "width", width); read_anymap_param(properties, "num_inference_steps", num_inference_steps); - + read_anymap_param(properties, "adapters", adapters); + validate(); } @@ -60,8 +62,10 @@ Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir) { const std::string class_name = get_class_name(root_dir); if (class_name == "StableDiffusionPipeline" || - class_name == "LatentConsistencyModelPipeline") { + class_name == "LatentConsistencyModelPipeline") { m_impl = std::make_shared(root_dir); + } else if (class_name == "StableDiffusionXLPipeline") { + m_impl = std::make_shared(root_dir); } else { OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); } @@ -70,15 +74,17 @@ Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir) { Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) { const std::string class_name = get_class_name(root_dir); - if (class_name == "StableDiffusionPipeline" || + if (class_name == "StableDiffusionPipeline" || class_name == "LatentConsistencyModelPipeline") { m_impl = std::make_shared(root_dir, device, properties); + } else if (class_name == "StableDiffusionXLPipeline") { + m_impl = std::make_shared(root_dir, device, properties); } else { OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); } } -Text2ImagePipeline::Text2ImagePipeline(const std::shared_ptr& impl) +Text2ImagePipeline::Text2ImagePipeline(const std::shared_ptr& impl) : m_impl(impl) { assert(m_impl != nullptr); } @@ -104,6 +110,20 @@ Text2ImagePipeline Text2ImagePipeline::latent_consistency_model( return stable_diffusion(scheduler, clip_text_model, unet, vae_decoder); } +Text2ImagePipeline Text2ImagePipeline::stable_diffusion_xl( + const std::shared_ptr& scheduler, + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder) { + auto impl = std::make_shared(clip_text_model, clip_text_model_with_projection, unet, vae_decoder); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return Text2ImagePipeline(impl); +} + Text2ImagePipeline::GenerationConfig Text2ImagePipeline::get_generation_config() const { return 
m_impl->get_generation_config(); } diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp index 465f5a9780..05539b67dc 100644 --- a/src/cpp/src/vision_encoder.cpp +++ b/src/cpp/src/vision_encoder.cpp @@ -216,6 +216,65 @@ ov::Tensor preprocess_for_encoder(const ov::Tensor& images, size_t kernel) { return permuted_tensor; } +// torch.bucketize(fractional_coords, boundaries, right=True) +std::vector bucket_size_right(const std::vector& fractional_coords, const std::vector& boundaries) { + std::vector bucket_coords(fractional_coords.size()); + std::transform(fractional_coords.begin(), fractional_coords.end(), bucket_coords.begin(), [&boundaries](float fractional_coord) { + return std::distance(boundaries.begin(), std::upper_bound(boundaries.begin(), boundaries.end(), fractional_coord)); + }); + return bucket_coords; +} + +ov::Tensor prepare_vis_position_ids( + const ov::Tensor& pixel_values, + const ov::Tensor& patch_attention_mask, + const std::vector tgt_sizes, + size_t patch_size, + size_t num_patches_per_side +) { + size_t batch_size = pixel_values.get_shape().at(0); + size_t max_im_h = pixel_values.get_shape().at(2), max_im_w = pixel_values.get_shape().at(3); + size_t max_nb_patches_h = max_im_h / patch_size, max_nb_patches_w = max_im_w / patch_size; + std::vector boundaries(1.0f * num_patches_per_side - 1); + std::generate(boundaries.begin(), boundaries.end(), [num_patches_per_side, val = 0.0f]() mutable { + val += 1.0f / num_patches_per_side; + return val; + }); + size_t position_ids_batch_elem = max_nb_patches_h * max_nb_patches_w; + ov::Tensor position_ids{ov::element::i64, {batch_size, position_ids_batch_elem}}; + // throw std::runtime_error(""); + int64_t* res_data = position_ids.data(); + std::fill_n(res_data, position_ids.get_size(), 0); + + for (size_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) { + size_t nb_patches_h = tgt_sizes.at(batch_idx).height; + size_t nb_patches_w = tgt_sizes.at(batch_idx).width; + + std::vector fractional_coords_h(nb_patches_h); + std::generate(fractional_coords_h.begin(), fractional_coords_h.end(), [nb_patches_h, val = -1.0f / nb_patches_h]() mutable { + val += 1.0f / nb_patches_h; + return val; + }); + std::vector fractional_coords_w(nb_patches_w); + std::generate(fractional_coords_w.begin(), fractional_coords_w.end(), [nb_patches_w, val = -1.0f / nb_patches_w]() mutable { + val += 1.0f / nb_patches_w; + return val; + }); + + std::vector bucket_coords_h = bucket_size_right(fractional_coords_h, boundaries); + std::vector bucket_coords_w = bucket_size_right(fractional_coords_w, boundaries); + + std::vector pos_ids(bucket_coords_h.size() * bucket_coords_w.size()); + for (size_t col = 0; col < bucket_coords_h.size(); ++col) { + for (size_t row = 0; row < bucket_coords_w.size(); ++row) {; + pos_ids.at(col * bucket_coords_w.size() + row) = bucket_coords_h.at(col) * num_patches_per_side + bucket_coords_w.at(row); + } + } + std::copy(pos_ids.begin(), pos_ids.end(), res_data + batch_idx * position_ids_batch_elem); + } + return position_ids; +} + EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) { clip_image_u8 source{ int(img.get_shape().at(3)), @@ -244,13 +303,12 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o ov::Tensor patch_attention_mask{ov::element::boolean, {pixel_values.get_shape().at(0), 1, resized_source_size.height * 
resized_source_size.width}}; std::fill_n(patch_attention_mask.data(), patch_attention_mask.get_size(), true); encoder.set_tensor("patch_attention_mask", patch_attention_mask); - ov::Tensor tgt_sizes{ov::element::i64, {1, 2}}; - int64_t* tgt_sizes_data = tgt_sizes.data(); - tgt_sizes_data[0] = resized_source_size.height; - tgt_sizes_data[1] = resized_source_size.width; - encoder.set_tensor("tgt_sizes", tgt_sizes); + ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size); + encoder.set_tensor("position_ids", position_ids); encoder.infer(); - const ov::Tensor& resized_source = encoder.get_output_tensor(); + const ov::Tensor& output_tensor = encoder.get_output_tensor(); + ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape()}; + output_tensor.copy_to(resized_source); if (1 == preprocessed.size()) { return {std::move(resized_source), resized_source_size}; @@ -268,24 +326,22 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o for (size_t col = 0; col < preprocessed.at(row).size(); ++col) { clip_image_f32& elem = preprocessed.at(row).at(col); sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size}); - encoder.set_tensor("pixel_values", preprocess_for_encoder( + ov::Tensor pixel_values = preprocess_for_encoder( {ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()}, patch_size - )); + ); + encoder.set_tensor("pixel_values", pixel_values); ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, sliced_sizes.back().height * sliced_sizes.back().width}}; std::fill_n(patch_attention_mask.data(), patch_attention_mask.get_size(), true); encoder.set_tensor("patch_attention_mask", patch_attention_mask); - ov::Tensor tgt_sizes{ov::element::i64, {1, 2}}; - int64_t* tgt_sizes_data = tgt_sizes.data(); - tgt_sizes_data[0] = sliced_sizes.back().height; - tgt_sizes_data[1] = sliced_sizes.back().width; - encoder.set_tensor("tgt_sizes", tgt_sizes); + ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {sliced_sizes.back()}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size); + encoder.set_tensor("position_ids", position_ids); + const ov::Tensor& old = encoder.get_output_tensor(); encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size}); encoder.infer(); + encoder.set_output_tensor(old); } } - // Override prev output tensor that doesn't own memory. 
- encoder.set_output_tensor(resized_source); return {resized_source, resized_source_size, encoded_slices, sliced_sizes}; } } @@ -302,6 +358,8 @@ VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std:: EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) { clip_ctx ctx_clip; + ctx_clip.patch_size = m_processor_config.patch_size; + ctx_clip.image_size = m_processor_config.image_size; std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean); std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std); return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums); diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index 66a65ef4cb..99c38c976d 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -338,12 +338,11 @@ DecodedResults VLMPipeline::generate( const StreamerVariant& streamer ) { std::string images_prompt; - EncodedImage embeds; - if (!rgbs.empty()) { - OPENVINO_ASSERT(1 == rgbs.size(), "TODO: Only a single image allowed"); - embeds = m_vision_encoder.encode(rgbs.at(0)); + std::vector embeds; + for (const ov::Tensor& rgb : rgbs) { + EncodedImage encoded_image = m_vision_encoder.encode(rgb); if (m_vlm_config.use_image_id) { - images_prompt = m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; + images_prompt += m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; ++image_id; } std::string unk64; @@ -351,8 +350,8 @@ DecodedResults VLMPipeline::generate( unk64 += m_vlm_config.unk; } images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; - if (embeds.slices) { - ov::Shape slices_shape = embeds.slices.get_shape(); + if (encoded_image.slices) { + ov::Shape slices_shape = encoded_image.slices.get_shape(); for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; @@ -365,6 +364,7 @@ DecodedResults VLMPipeline::generate( // Strangely, \n isn't placed between . images_prompt += '\n'; } + embeds.push_back(std::move(encoded_image)); } images_prompt += prompt; ov::Tensor encoded_input; @@ -402,61 +402,47 @@ DecodedResults VLMPipeline::generate( m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), "Unexpected embedding size" ); - if (!rgbs.empty()) { - ov::Tensor special_tokens = m_tokenizer.encode( - m_vlm_config.im_start - + m_vlm_config.im_end - + m_vlm_config.slice_start - + m_vlm_config.slice_end - ).input_ids; - OPENVINO_ASSERT( - 4 == special_tokens.get_shape().at(1), - "Every special token must be represented with a single int." - ); - size_t im_start_id = special_tokens.data()[0]; - size_t im_end_id = special_tokens.data()[1]; - size_t slice_start_id = special_tokens.data()[2]; - size_t slice_end_id = special_tokens.data()[3]; - int64_t* ids = encoded_input.data(); - const ov::Tensor& resampled_source = resample(*this, embeds.resized_source, {embeds.resized_source_size}); + ov::Tensor special_tokens = m_tokenizer.encode( + m_vlm_config.im_start + + m_vlm_config.im_end + + m_vlm_config.slice_start + + m_vlm_config.slice_end + ).input_ids; + OPENVINO_ASSERT( + 4 == special_tokens.get_shape().at(1), + "Every special token must be represented with a single int." 
+ ); + int64_t im_start_id = special_tokens.data()[0]; + int64_t im_end_id = special_tokens.data()[1]; + int64_t slice_start_id = special_tokens.data()[2]; + int64_t slice_end_id = special_tokens.data()[3]; + int64_t im_start_pos = 0, slice_start_pos = 0; + int64_t* begin = encoded_input.data(); + int64_t* ids = begin; + size_t encoded_input_size = encoded_input.get_size(); + int64_t* end = ids + encoded_input_size; + float* inputs_embeds_data = inputs_embeds.data(); + for (const EncodedImage& encoded_image : embeds) { + const ov::Tensor& resampled_source = resample(*this, encoded_image.resized_source, {encoded_image.resized_source_size}); float* emb = resampled_source.data(); - bool replacing = false; - for (size_t token_idx = 0; token_idx < inputs_embeds.get_shape().at(1); ++token_idx) { - if (im_start_id == ids[token_idx]) { - replacing = true; - } - if (replacing) { - std::copy_n(emb, resampled_source.get_size(), inputs_embeds.data() + token_idx * m_vlm_config.hidden_size); - token_idx += resampled_source.get_shape().at(1); - replacing = false; - break; - } - } - if (embeds.slices) { + ids = std::find(ids, end, im_start_id); + OPENVINO_ASSERT(end != ids); + std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + ids += m_vlm_config.query_num; + if (encoded_image.slices) { size_t token_idx = 0; - const ov::Shape& slices_shape = embeds.slices.get_shape(); - const std::vector& sliced_sizes = embeds.slices_sizes; + const ov::Shape& slices_shape = encoded_image.slices.get_shape(); + const std::vector& sliced_sizes = encoded_image.slices_sizes; for (size_t i = 0; i < slices_shape.at(0); ++i) { for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { size_t d2 = slices_shape.at(2); size_t d3 = slices_shape.at(3); - ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, embeds.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {sliced_sizes.at(i * slices_shape.at(1) + ja)}); - for (; token_idx < inputs_embeds.get_shape().at(1); ++token_idx) { - if (slice_start_id == ids[token_idx]) { - replacing = true; - } - if (slice_end_id == ids[token_idx]) { - replacing = false; - break; - } - if (replacing) { - std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds.data() + token_idx * m_vlm_config.hidden_size); - token_idx += vision_embed_tensor_i_j.get_shape().at(1); - replacing = false; - break; - } - } + ids = std::find(ids, end, slice_start_id); + OPENVINO_ASSERT(end != ids); + std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + ids += m_vlm_config.query_num; } } } @@ -552,13 +538,23 @@ DecodedResults VLMPipeline::generate( const ov::AnyMap& config_map ) { auto image = config_map.find(ov::genai::image.name()); + auto images = config_map.find(ov::genai::images.name()); + OPENVINO_ASSERT( + config_map.end() == image || config_map.end() == images, + "Only one property can be set: image of images." 
+ ); + std::vector rgbs; + if (config_map.end() != image) { + rgbs = {image->second.as()}; + } if (config_map.end() != images) { + rgbs = images->second.as>(); + } ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map); GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); config.update_generation_config(config_map); return generate( prompt, - config_map.end() == image ? std::vector{} - : std::vector{image->second.as()}, + rgbs, config, utils::get_streamer_from_map(config_map) ); diff --git a/src/docs/BUILD.md b/src/docs/BUILD.md index 79d6ce861a..77657620a0 100644 --- a/src/docs/BUILD.md +++ b/src/docs/BUILD.md @@ -43,11 +43,11 @@ OpenVINO GenAI can be built as an extra module during the OpenVINO build process 1. Clone OpenVINO and OpenVINO GenAI repositories: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.git - git clone --recursive https://github.com/openvinotoolkit/openvino_genai.git + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git ``` 2. Configure CMake with OpenVINO extra modules: ```sh - cmake -DOPENVINO_EXTRA_MODULES=./openvino_genai -DCPACK_ARCHIVE_COMPONENT_INSTALL=OFF -S ./openvino -B ./build + cmake -DOPENVINO_EXTRA_MODULES=./openvino.genai -DCPACK_ARCHIVE_COMPONENT_INSTALL=OFF -S ./openvino -B ./build ``` 3. Build OpenVINO archive with GenAI: ```sh diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index c032ca1a55..bf76f34f4f 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -18,7 +18,7 @@ if(NOT pybind11_POPULATED) add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) endif() -pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp py_whisper_pipeline.cpp utils.cpp) +pybind11_add_module(py_generate_pipeline py_vlm_pipeline.cpp py_generate_pipeline.cpp py_whisper_pipeline.cpp utils.cpp) target_link_libraries(py_generate_pipeline PRIVATE openvino::genai) set_target_properties(py_generate_pipeline PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index c4d219fcf4..879dfc8262 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -17,7 +17,8 @@ EncodedResults, GenerationConfig, GenerationResult, - LLMPipeline, + LLMPipeline, + VLMPipeline, PerfMetrics, RawPerfMetrics, SchedulerConfig, diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 74b704f34b..b636253e33 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -50,6 +50,7 @@ std::vector get_ms(const T& instance, U T::*member) { } void init_whisper_pipeline(py::module_& m); +void init_vlm_pipeline(py::module_& m); namespace { @@ -310,68 +311,6 @@ auto cache_eviction_config_docstring = R"( :type aggregation_mode: openvino_genai.AggregationMode )"; -OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfig& config, const py::kwargs& kwargs) { - if(!config.has_value() && kwargs.empty()) - return std::nullopt; - - GenerationConfig res_config; - if(config.has_value()) - res_config = *config; - - for (const auto& item : kwargs) { - std::string key = py::cast(item.first); - py::object value = py::cast(item.second); - - if (item.second.is_none()) { - // Even if argument key name does not fit GenerationConfig name - // it's not an eror if it's not defined. 
- // Some HF configs can have parameters for methods currenly unsupported in ov_genai - // but if their values are not set / None, then this should not block - // us from reading such configs, e.g. {"typical_p": None, 'top_p': 1.0,...} - return res_config; - } - - if (key == "max_new_tokens") { - res_config.max_new_tokens = py::cast(item.second); - } else if (key == "max_length") { - res_config.max_length = py::cast(item.second); - } else if (key == "ignore_eos") { - res_config.ignore_eos = py::cast(item.second); - } else if (key == "num_beam_groups") { - res_config.num_beam_groups = py::cast(item.second); - } else if (key == "num_beams") { - res_config.num_beams = py::cast(item.second); - } else if (key == "diversity_penalty") { - res_config.diversity_penalty = py::cast(item.second); - } else if (key == "length_penalty") { - res_config.length_penalty = py::cast(item.second); - } else if (key == "num_return_sequences") { - res_config.num_return_sequences = py::cast(item.second); - } else if (key == "no_repeat_ngram_size") { - res_config.no_repeat_ngram_size = py::cast(item.second); - } else if (key == "stop_criteria") { - res_config.stop_criteria = py::cast(item.second); - } else if (key == "temperature") { - res_config.temperature = py::cast(item.second); - } else if (key == "top_p") { - res_config.top_p = py::cast(item.second); - } else if (key == "top_k") { - res_config.top_k = py::cast(item.second); - } else if (key == "do_sample") { - res_config.do_sample = py::cast(item.second); - } else if (key == "repetition_penalty") { - res_config.repetition_penalty = py::cast(item.second); - } else if (key == "eos_token_id") { - res_config.set_eos_token_id(py::cast(item.second)); - } else { - throw(std::invalid_argument("'" + key + "' is incorrect GenerationConfig parameter name. " - "Use help(openvino_genai.GenerationConfig) to get list of acceptable parameters.")); - } - } - - return res_config; -} - py::list handle_utf8_results(const std::vector& decoded_res) { // pybind11 decodes strings similar to Pythons's // bytes.decode('utf-8'). It raises if the decoding fails. @@ -392,26 +331,10 @@ py::object call_common_generate( const utils::PyBindStreamerVariant& py_streamer, const py::kwargs& kwargs ) { - auto updated_config = update_config_from_kwargs(config, kwargs); + auto updated_config = ov::genai::pybind::utils::update_config_from_kwargs(config, kwargs); py::object results; EncodedInputs tensor_data; - StreamerVariant streamer = std::monostate(); - - std::visit(utils::overloaded { - [&streamer](const std::function& py_callback){ - // Wrap python streamer with manual utf-8 decoding. Do not rely - // on pybind automatic decoding since it raises exceptions on incomplete strings. - auto callback_wrapped = [&py_callback](std::string subword) -> bool { - auto py_str = PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace"); - return py_callback(py::reinterpret_borrow(py_str)); - }; - streamer = callback_wrapped; - }, - [&streamer](std::shared_ptr streamer_cls){ - streamer = streamer_cls; - }, - [](std::monostate none){ /*streamer is already a monostate */ } - }, py_streamer); + StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer); // Call suitable generate overload for each type of input. 
std::visit(utils::overloaded { @@ -635,7 +558,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { // Binding for GenerationConfig py::class_(m, "GenerationConfig", generation_config_docstring) .def(py::init(), py::arg("json_path"), "path where generation_config.json is stored") - .def(py::init([](py::kwargs kwargs) { return *update_config_from_kwargs(GenerationConfig(), kwargs); })) + .def(py::init([](py::kwargs kwargs) { return *ov::genai::pybind::utils::update_config_from_kwargs(GenerationConfig(), kwargs); })) .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) .def_readwrite("max_length", &GenerationConfig::max_length) .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) @@ -840,4 +763,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { // init whisper bindings init_whisper_pipeline(m); + + // init vlm pipeline + init_vlm_pipeline(m); } diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp new file mode 100644 index 0000000000..8d6a5f0077 --- /dev/null +++ b/src/python/py_vlm_pipeline.cpp @@ -0,0 +1,199 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + + +#include +#include +#include +#include +#include +#include "openvino/genai/vlm_pipeline.hpp" +#include "../cpp/src/tokenizers_path.hpp" +#include "./utils.hpp" + +namespace py = pybind11; +namespace utils = ov::genai::pybind::utils; + + +auto vlm_generate_docstring = R"( + Generates sequences for VLMs. + + :param prompt: input prompt + :type prompt: str + + :param images: list of images + :type images: List[ov.Tensor] + + :param generation_config: generation_config + :type generation_config: GenerationConfig or a Dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped + :type streamer: Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. + :type kwargs: Dict + + :return: return results in decoded form + :rtype: DecodedResults +)"; + +auto vlm_generate_kwargs_docstring = R"( + Generates sequences for VLMs. + + :param prompt: input prompt + :type prompt: str + + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params.
+ + Expected parameters list: + image: ov.Tensor - input image, + images: List[ov.Tensor] - input images, + generation_config: GenerationConfig, + streamer: Callable[[str], bool], ov.genai.StreamerBase - streamer either as a lambda with a boolean returning flag whether generation should be stopped + + :return: return results in decoded form + :rtype: DecodedResults +)"; + +py::object call_vlm_generate( + ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const std::vector& images, + const ov::genai::GenerationConfig& generation_config, + const utils::PyBindStreamerVariant& py_streamer, + const py::kwargs& kwargs +) { + auto updated_config = *ov::genai::pybind::utils::update_config_from_kwargs(generation_config, kwargs); + ov::genai::StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer); + + return py::cast(pipe.generate(prompt, images, updated_config, streamer)); +} + +py::object call_vlm_generate( + ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const py::kwargs& kwargs +) { + ov::AnyMap params = {}; + + for (const auto& item : kwargs) { + std::string key = py::cast(item.first); + py::object value = py::cast(item.second); + + if (key == "images") { + params.insert({ov::genai::images(std::move(py::cast>(item.second)))}); + } else if (key == "image") { + params.insert({ov::genai::image(std::move(py::cast(item.second)))}); + } else if (key == "generation_config") { + params.insert({ov::genai::generation_config(std::move(py::cast(item.second)))}); + } else if (key == "streamer") { + auto py_streamer = py::cast(value); + params.insert({ov::genai::streamer(std::move(ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer)))}); + + } else { + throw(std::invalid_argument("'" + key + "' is unexpected parameter name. " + "Use help(openvino_genai.VLMPipeline.generate) to get list of acceptable parameters.")); + } + } + + return py::cast(pipe.generate(prompt, params)); +} + +void init_vlm_pipeline(py::module_& m) { + py::class_(m, "VLMPipeline", "This class is used for generation with VLMs") + .def(py::init([]( + const std::string& model_path, + const std::string& device, + const std::map& config + ) { + ScopedVar env_manager(utils::ov_tokenizers_module_path()); + return std::make_unique(model_path, device, utils::properties_to_any_map(config)); + }), + py::arg("model_path"), "folder with exported model files", + py::arg("device") = "CPU", "device on which inference will be done", + py::arg("config") = ov::AnyMap({}), "openvino.properties map" + R"( + VLMPipeline class constructor. + model_path (str): Path to the folder with exported model files. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. + )") + + .def(py::init([]( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const std::map& config + ) { + ScopedVar env_manager(utils::ov_tokenizers_module_path()); + return std::make_unique(model_path, tokenizer, device, utils::properties_to_any_map(config)); + }), + py::arg("model_path"), + py::arg("tokenizer"), + py::arg("device") = "CPU", + py::arg("config") = ov::AnyMap({}), "openvino.properties map" + R"( + VLMPipeline class constructor for manualy created openvino_genai.Tokenizer. + model_path (str): Path to the folder with exported model files. + tokenizer (openvino_genai.Tokenizer): tokenizer object. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. 
+ )") + .def("start_chat", &ov::genai::VLMPipeline::start_chat, py::arg("system_message") = "") + .def("finish_chat", &ov::genai::VLMPipeline::finish_chat) + .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config) + .def( + "generate", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const std::vector& images, + const ov::genai::GenerationConfig& generation_config, + const utils::PyBindStreamerVariant& streamer, + const py::kwargs& kwargs + ) { + return call_vlm_generate(pipe, prompt, images, generation_config, streamer, kwargs); + }, + py::arg("prompt"), "Input string", + py::arg("images"), "Input images", + py::arg("generation_config") = std::nullopt, "generation_config", + py::arg("streamer") = std::monostate(), "streamer", + (vlm_generate_docstring + std::string(" \n ")).c_str() + ) + .def( + "generate", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const py::kwargs& kwargs + ) { + return call_vlm_generate(pipe, prompt, kwargs); + }, + py::arg("prompt"), "Input string", + (vlm_generate_kwargs_docstring + std::string(" \n ")).c_str() + ) + .def( + "__call__", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const std::vector& images, + const ov::genai::GenerationConfig& generation_config, + const utils::PyBindStreamerVariant& streamer, + const py::kwargs& kwargs + ) { + return call_vlm_generate(pipe, prompt, images, generation_config, streamer, kwargs); + }, + py::arg("prompt"), "Input string", + py::arg("images"), "Input images", + py::arg("generation_config") = std::nullopt, "generation_config", + py::arg("streamer") = std::monostate(), "streamer", + (vlm_generate_docstring + std::string(" \n ")).c_str() + ) + .def( + "__call__", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const py::kwargs& kwargs + ) { + return call_vlm_generate(pipe, prompt, kwargs); + }, + py::arg("prompt"), "Input string", + (vlm_generate_kwargs_docstring + std::string(" \n ")).c_str() + ); +} diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index b7c6756e89..5d354ef93c 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -170,23 +170,7 @@ py::object call_whisper_common_generate(WhisperPipeline& pipe, auto updated_config = update_whisper_config_from_kwargs(base_config, kwargs); - StreamerVariant streamer = std::monostate(); - - std::visit(utils::overloaded{[&streamer](const std::function& py_callback) { - // Wrap python streamer with manual utf-8 decoding. Do not rely - // on pybind automatic decoding since it raises exceptions on incomplete strings. 
- auto callback_wrapped = [&py_callback](std::string subword) -> bool { - auto py_str = - PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace"); - return py_callback(py::reinterpret_borrow(py_str)); - }; - streamer = callback_wrapped; - }, - [&streamer](std::shared_ptr streamer_cls) { - streamer = streamer_cls; - }, - [](std::monostate none) { /*streamer is already a monostate */ }}, - py_streamer); + StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer); return py::cast(pipe.generate(raw_speech_input, updated_config, streamer)); } diff --git a/src/python/utils.cpp b/src/python/utils.cpp index bf8f195766..65033d0866 100644 --- a/src/python/utils.cpp +++ b/src/python/utils.cpp @@ -161,4 +161,87 @@ std::string ov_tokenizers_module_path() { return py::str(py::module_::import("openvino_tokenizers").attr("_ext_path")); } +ov::genai::StreamerVariant pystreamer_to_streamer(const utils::PyBindStreamerVariant& py_streamer) { + ov::genai::StreamerVariant streamer = std::monostate(); + + std::visit(utils::overloaded { + [&streamer](const std::function& py_callback){ + // Wrap python streamer with manual utf-8 decoding. Do not rely + // on pybind automatic decoding since it raises exceptions on incomplete strings. + auto callback_wrapped = [py_callback](std::string subword) -> bool { + auto py_str = PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace"); + return py_callback(py::reinterpret_borrow(py_str)); + }; + streamer = callback_wrapped; + }, + [&streamer](std::shared_ptr streamer_cls){ + streamer = streamer_cls; + }, + [](std::monostate none){ /*streamer is already a monostate */ } + }, py_streamer); + return streamer; +} + +ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::OptionalGenerationConfig& config, const py::kwargs& kwargs) { + if(!config.has_value() && kwargs.empty()) + return std::nullopt; + + ov::genai::GenerationConfig res_config; + if(config.has_value()) + res_config = *config; + + for (const auto& item : kwargs) { + std::string key = py::cast(item.first); + py::object value = py::cast(item.second); + + if (item.second.is_none()) { + // Even if argument key name does not fit GenerationConfig name + // it's not an eror if it's not defined. + // Some HF configs can have parameters for methods currenly unsupported in ov_genai + // but if their values are not set / None, then this should not block + // us from reading such configs, e.g. 
{"typical_p": None, 'top_p': 1.0,...} + return res_config; + } + + if (key == "max_new_tokens") { + res_config.max_new_tokens = py::cast(item.second); + } else if (key == "max_length") { + res_config.max_length = py::cast(item.second); + } else if (key == "ignore_eos") { + res_config.ignore_eos = py::cast(item.second); + } else if (key == "num_beam_groups") { + res_config.num_beam_groups = py::cast(item.second); + } else if (key == "num_beams") { + res_config.num_beams = py::cast(item.second); + } else if (key == "diversity_penalty") { + res_config.diversity_penalty = py::cast(item.second); + } else if (key == "length_penalty") { + res_config.length_penalty = py::cast(item.second); + } else if (key == "num_return_sequences") { + res_config.num_return_sequences = py::cast(item.second); + } else if (key == "no_repeat_ngram_size") { + res_config.no_repeat_ngram_size = py::cast(item.second); + } else if (key == "stop_criteria") { + res_config.stop_criteria = py::cast(item.second); + } else if (key == "temperature") { + res_config.temperature = py::cast(item.second); + } else if (key == "top_p") { + res_config.top_p = py::cast(item.second); + } else if (key == "top_k") { + res_config.top_k = py::cast(item.second); + } else if (key == "do_sample") { + res_config.do_sample = py::cast(item.second); + } else if (key == "repetition_penalty") { + res_config.repetition_penalty = py::cast(item.second); + } else if (key == "eos_token_id") { + res_config.set_eos_token_id(py::cast(item.second)); + } else { + throw(std::invalid_argument("'" + key + "' is incorrect GenerationConfig parameter name. " + "Use help(openvino_genai.GenerationConfig) to get list of acceptable parameters.")); + } + } + + return res_config; +} + } // namespace ov::genai::pybind::utils diff --git a/src/python/utils.hpp b/src/python/utils.hpp index 0a18a9c5f9..4047bdcfe7 100644 --- a/src/python/utils.hpp +++ b/src/python/utils.hpp @@ -6,6 +6,7 @@ #include #include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/llm_pipeline.hpp" namespace py = pybind11; using ov::genai::StreamerBase; @@ -33,4 +34,8 @@ std::map properties_to_any_map(const std::map Dict[str, List[str]]: file_path = TESTS_ROOT / 'data' / file_name with open(file_path, 'r') as f: - return {"questions": [s for s in f]} + return {"prompts": [s for s in f]} def get_scheduler_config(num_kv_blocks: int) -> SchedulerConfig: scheduler_config = SchedulerConfig() @@ -118,7 +118,7 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t data_dict = load_prompts_dataset(test_struct.prompt_file) - evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict, + evaluator = whowhatbench.TextEvaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict, generation_config=generation_config, generation_config_base=generation_config, max_new_tokens=test_struct.max_new_tokens, seqs_per_request=seqs_per_request)