diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 7b0c7df454..6ceb2f162d 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -14,6 +14,7 @@ concurrency: env: l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu20_2024.5.0.dev20240830_x86_64.tgz + l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu22_2024.5.0.dev20240830_x86_64.tgz m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/m_openvino_toolkit_macos_12_6_2024.5.0.dev20240830_x86_64.tgz w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/w_openvino_toolkit_windows_2024.5.0.dev20240830_x86_64.zip jobs: @@ -202,8 +203,7 @@ jobs: echo "Multi prompt" passed cpp-greedy_causal_lm-windows: - runs-on: windows-latest - if: ${{ false }} # TODO: fix Windows + runs-on: windows-2019-16-core env: PYTHONIOENCODING: "utf8" defaults: @@ -218,6 +218,8 @@ jobs: python-version: 3.9 - name: Configure Developer Command Prompt for Microsoft Visual C++ uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + with: + toolset: 14.29 - run: curl --output ov.zip ${{ env.w_ov_link }} - run: unzip -d ov ov.zip - run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" @@ -681,42 +683,38 @@ jobs: diff pred2.txt ref.txt echo "Chat sample python" passed - visual_language_sample: - strategy: - fail-fast: false - # Windows fails to compile Jinja2Cpp. - matrix: {runs-on: [ubuntu-20.04-16-cores, macos-13]} - runs-on: ${{ matrix.runs-on }} + visual_language_chat_sample-ubuntu: + runs-on: ubuntu-22.04-16-cores steps: - uses: actions/checkout@v4 - with: {submodules: recursive} + with: + submodules: recursive - uses: actions/setup-python@v4 - with: {python-version: 3.12} - - run: mkdir ./ov/ - - if: ${{ 'ubuntu-20.04-16-cores' == matrix.runs-on }} - run: > - curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz - && sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - if: ${{ 'macos-13' == matrix.runs-on }} - run: > - curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz - && brew install coreutils scons - - run: OpenVINO_DIR=./ov/runtime/cmake/ cmake -DCMAKE_BUILD_TYPE=Release -B ./build/ ./ - - run: > - LD_LIBRARY_PATH=${{ github.workspace }}/ov/runtime/3rdparty/tbb/lib/:$LD_LIBRARY_PATH - cmake --build ./build/ --config Release --target visual_language_chat -j - - run: > + with: + python-version: 3.11 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_u22_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | source ./ov/setupvars.sh - && python -m pip install --upgrade-strategy eager ./thirdparty/openvino_tokenizers/[transformers] -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - run: > + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --target visual_language_chat -j + - name: Download and convert a model and an image + run: | source ./ov/setupvars.sh - && python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/ - - run: wget 
https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 - - run: > + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/ + wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg + + - name: Run chat chat sample + run: > source ./ov/setupvars.sh - && ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11 + && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg <<< $'What is on the image?\nWhat is special on the image?' - timeout-minutes: 110 cpp-continuous-batching-ubuntu: runs-on: ubuntu-20.04-8-cores @@ -762,8 +760,7 @@ jobs: timeout 200s ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 10 --dynamic_split_fuse --max_batch_size 256 --max_input_len 256 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1 cpp-continuous-batching-windows: - runs-on: windows-latest - if: ${{ false }} # TODO: fix Windows + runs-on: windows-2019-16-core env: PYTHONIOENCODING: "utf8" defaults: @@ -778,6 +775,8 @@ jobs: python-version: 3.9 - name: Configure Developer Command Prompt for Microsoft Visual C++ uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + with: + toolset: 14.29 - name: Install OpenVINO run: | curl --output ov.zip ${{ env.w_ov_link }} @@ -860,7 +859,7 @@ jobs: cpp-beam_search_causal_lm-Qwen-7B-Chat, cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2, cpp-beam_search_causal_lm-notus-7b-v1, cpp-speculative_decoding_lm-ubuntu, cpp-prompt_lookup_decoding_lm-ubuntu, cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu, - visual_language_sample, + visual_language_chat_sample-ubuntu, cpp-continuous-batching-windows, cpp-continuous-batching-macos] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index cd31ae497d..a5c057a0ab 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -67,11 +67,10 @@ jobs: - name: Run app run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - ./build/samples/cpp/stable_diffusion/stable_diffusion ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + ./build/samples/cpp/text2image/stable_diffusion ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" lcm_dreamshaper_v7_cpp-windows: - runs-on: windows-latest - if: ${{ false }} # TODO: fix Windows + runs-on: windows-2019-16-core defaults: run: shell: pwsh @@ -89,6 +88,11 @@ jobs: mv ./tmp/*/* . popd + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + with: + toolset: 14.29 + - name: Build app run: | . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" @@ -116,9 +120,9 @@ jobs: optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --task stable-diffusion --weight-format fp16 models/lcm_dreamshaper_v7/FP16 - name: Run app - run: | + run: > . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - ./build/samples/cpp/stable_diffusion/Release/lcm_dreamshaper.exe ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + & "./build/samples/cpp/text2image/Release/stable_diffusion.exe ./models/lcm_dreamshaper_v7/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" Overall_Status: name: ci/gha_overall_status_lcm diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 45e6dc2941..be309c732d 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -40,11 +40,7 @@ jobs: python -m pip install --upgrade pip python -m pip install flake8 pytest black GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt - python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url -https://storage.openvinotoolkit.org/simple/wheels/nightly - GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt - GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }} - + python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -74,6 +70,9 @@ https://storage.openvinotoolkit.org/simple/wheels/nightly python ./llm_bench/python/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./llm_bench/python/prompts/stable-diffusion.jsonl -d cpu -n 1 - name: WWB Tests run: | + GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt + pip install git+https://github.com/huggingface/optimum.git + GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }} python -m pytest llm_bench/python/who_what_benchmark/tests stateful: runs-on: ubuntu-20.04 @@ -86,13 +85,13 @@ https://storage.openvinotoolkit.org/simple/wheels/nightly run: | GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r llm_bench/python/requirements.txt python -m pip uninstall --yes openvino - python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url -https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir . 
--stateful grep beam_idx pytorch/dldt/FP32/openvino_model.xml - name: WWB Tests run: | GIT_CLONE_PROTECTION_ACTIVE=false pip install -r llm_bench/python/who_what_benchmark/requirements.txt + pip install git+https://github.com/huggingface/optimum.git GIT_CLONE_PROTECTION_ACTIVE=false pip install llm_bench/python/who_what_benchmark/ pip install pytest python -m pytest llm_bench/python/who_what_benchmark/tests diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index c69287678d..ae6b7ce57b 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -63,15 +63,20 @@ jobs: run: | source openvino_sd_cpp/bin/activate optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --weight-format fp16 --task stable-diffusion models/dreamlike-art-dreamlike-anime-1.0/FP16 + wget -O ./models/soulcard.safetensors https://civitai.com/api/download/models/72591 - - name: Run app + - name: Run main app run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - ./build/samples/cpp/stable_diffusion/stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + ./build/samples/cpp/text2image/stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + + - name: Run LoRA app + run: | + source ${{ env.OV_INSTALL_DIR }}/setupvars.sh + ./build/samples/cpp/text2image/lora_stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "curly-haired unicorn in the forest, anime, line" ./models/soulcard.safetensors 0.7 stable_diffusion_1_5_cpp-windows: - runs-on: windows-latest - if: ${{ false }} # TODO: fix Windows + runs-on: windows-2019-16-core defaults: run: shell: pwsh @@ -89,6 +94,11 @@ jobs: mv ./tmp/*/* . popd + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + with: + toolset: 14.29 + - name: Build app run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" @@ -114,11 +124,17 @@ jobs: run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike-art-dreamlike-anime-1.0/FP16 + Invoke-WebRequest -Uri 'https://civitai.com/api/download/models/72591' -OutFile 'models/soulcard.safetensors' - - name: Run app - run: | + - name: Run main app + run: > + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + & "./build/samples/cpp/text2image/Release/stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" + + - name: Run LoRA app + run: > . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - ./build/samples/cpp/stable_diffusion/Release/stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + & "./build/samples/cpp/text2image/Release/lora_stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'curly-haired unicorn in the forest, anime, line' ./models/soulcard.safetensors 0.7" Overall_Status: name: ci/gha_overall_status_stable_diffusion diff --git a/README.md b/README.md index f20ae1c268..6a7c325f69 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ It includes the following pipelines: 6. [multinomial_causal_lm](./samples/cpp/multinomial_causal_lm/README.md) 7. [prompt_lookup_decoding_lm](./samples/cpp/prompt_lookup_decoding_lm/README.md) 8. [speculative_decoding_lm](./samples/cpp/speculative_decoding_lm/README.md) -3. [Stable Diffuison and Latent Consistency Model (with LoRA) C++ image generation pipeline](./samples/cpp/stable_diffusion/README.md) +3. [Stable Diffuison and Latent Consistency Model (with LoRA) C++ image generation pipeline](./samples/cpp/text2image/README.md) ### Requirements diff --git a/llm_bench/python/README.md b/llm_bench/python/README.md index b49ad980ab..3ef58f113a 100755 --- a/llm_bench/python/README.md +++ b/llm_bench/python/README.md @@ -1,140 +1,165 @@ -# Benchmarking script for large language models +# Benchmarking Script for Large Language Models -This script provides a unified approach to estimate performance for Large Language Models. -It is based on pipelines provided by Optimum-Intel and allows to estimate performance for -pytorch and openvino models, using almost the same code and precollected models. +This script provides a unified approach to estimate performance for Large Language Models (LLMs). It leverages pipelines provided by Optimum-Intel and allows performance estimation for PyTorch and OpenVINO models using nearly identical code and pre-collected models. -## Usage -### 1. Start a Python virtual environment +### 1. Prepare Python Virtual Environment for LLM Benchmarking ``` bash -python3 -m venv python-env -source python-env/bin/activate +python3 -m venv ov-llm-bench-env +source ov-llm-bench-env/bin/activate pip install --upgrade pip -pip install -r requirements.txt + +git clone https://github.com/openvinotoolkit/openvino.genai.git +cd openvino.genai/llm_bench/python/ +pip install -r requirements.txt ``` -> Note: -> If you are using an existing python environment, recommend following command to use all the dependencies with latest versions: -> pip install -U --upgrade-strategy eager -r requirements.txt -### 2. Convert a model to OpenVINO IR - -The optimum-cli tool allows you to convert models from Hugging Face to the OpenVINO IR format. 
More detailed info about tool usage can be found in [Optimum Intel documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/export) +> Note: +> For existing Python environments, run the following command to ensure that all dependencies are installed with the latest versions: +> `pip install -U --upgrade-strategy eager -r requirements.txt` -Prerequisites: -install conversion dependencies using `requirements.txt` +#### (Optional) Hugging Face Login : -Usage: +Login to Hugging Face if you want to use non-public models: ```bash -optimum-cli export openvino --model --weight-format +huggingface-cli login ``` -Paramters: -* `--model ` - model_id for downloading from huggngface_hub (https://huggingface.co/models) or path with directory where pytorch model located. -* `--weight-format` - precision for model conversion fp32, fp16, int8, int4 -* `` - output directory for saving OpenVINO model. +### 2. Convert Model to OpenVINO IR Format + +The `optimum-cli` tool simplifies converting Hugging Face models to OpenVINO IR format. +- Detailed documentation can be found in the [Optimum-Intel documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/export). +- To learn more about weight compression, see the [NNCF Weight Compression Guide](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html). +- For additional guidance on running inference with OpenVINO for LLMs, see the [OpenVINO LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html). -Usage example: -```bash -optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format fp16 models/llama-2-7b-chat -``` +**Usage:** -the result of running the command will have the following file structure: +```bash +optimum-cli export openvino --model --weight-format - |-llama-2-7b-chat - |-pytorch - |-dldt - |-FP16 - |-openvino_model.xml - |-openvino_model.bin - |-config.json - |-generation_config.json - |-tokenizer_config.json - |-tokenizer.json - |-tokenizer.model - |-special_tokens_map.json +optimum-cli export openvino -h # For detailed information +``` -### 3. Benchmarking +* `--model ` : model_id for downloading from [huggngface_hub](https://huggingface.co/models) or path with directory where pytorch model located. +* `--weight-format ` : precision for model conversion. Available options: `fp32, fp16, int8, int4, mxfp4` +* ``: output directory for saving generated OpenVINO model. -Prerequisites: -install benchmarking dependencies using `requirements.txt` +**NOTE:** +- Models larger than 1 billion parameters are exported to the OpenVINO format with 8-bit weights by default. You can disable it with `--weight-format fp32`. -``` bash -pip install -r requirements.txt +**Example:** +```bash +optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format fp16 models/llama-2-7b-chat ``` -note: **You can specify the installed OpenVINO version through pip install** -``` bash -# e.g. -pip install openvino==2023.3.0 +**Resulting file structure:** + +```console + models + └── llama-2-7b-chat + ├── config.json + ├── generation_config.json + ├── openvino_detokenizer.bin + ├── openvino_detokenizer.xml + ├── openvino_model.bin + ├── openvino_model.xml + ├── openvino_tokenizer.bin + ├── openvino_tokenizer.xml + ├── special_tokens_map.json + ├── tokenizer_config.json + ├── tokenizer.json + └── tokenizer.model ``` -### 4. Run the following command to test the performance of one LLM model +### 3. 
Benchmark LLM Model + +To benchmark the performance of the LLM, use the following command: + ``` bash python benchmark.py -m -d -r -f -p -n # e.g. -python benchmark.py -m models/llama-2-7b-chat/pytorch/dldt/FP32 -n 2 -python benchmark.py -m models/llama-2-7b-chat/pytorch/dldt/FP32 -p "What is openvino?" -n 2 -python benchmark.py -m models/llama-2-7b-chat/pytorch/dldt/FP32 -pf prompts/llama-2-7b-chat_l.jsonl -n 2 +python benchmark.py -m models/llama-2-7b-chat/ -n 2 +python benchmark.py -m models/llama-2-7b-chat/ -p "What is openvino?" -n 2 +python benchmark.py -m models/llama-2-7b-chat/ -pf prompts/llama-2-7b-chat_l.jsonl -n 2 ``` -Parameters: -* `-m` - model path -* `-d` - inference device (default=cpu) -* `-r` - report csv -* `-f` - framework (default=ov) -* `-p` - interactive prompt text -* `-pf` - path of JSONL file including interactive prompts -* `-n` - number of benchmarking iterations, if the value greater 0, will exclude the first iteration. (default=0) -* `-ic` - limit the output token size (default 512) of text_gen and code_gen models. - +**Parameters:** +- `-m`: Path to the model. +- `-d`: Inference device (default: CPU). +- `-r`: Path to the CSV report. +- `-f`: Framework (default: ov). +- `-p`: Interactive prompt text. +- `-pf`: Path to a JSONL file containing prompts. +- `-n`: Number of iterations (default: 0, the first iteration is excluded). +- `-ic`: Limit the output token size (default: 512) for text generation and code generation models. + +**Additional options:** ``` bash python ./benchmark.py -h # for more information ``` -## Running `torch.compile()` +#### Benchmarking the Original PyTorch Model: +To benchmark the original PyTorch model, first download the model locally and then run benchmark by specifying PyTorch as the framework with parameter `-f pt` -The option `--torch_compile_backend` uses `torch.compile()` to speed up -the PyTorch code by compiling it into optimized kernels using a selected backend. +```bash +# Download PyTorch Model +huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir models/llama-2-7b-chat/pytorch +# Benchmark with PyTorch Framework +python benchmark.py -m models/llama-2-7b-chat/pytorch -n 2 -f pt +``` -Prerequisites: install benchmarking dependencies using requirements.txt +> **Note:** If needed, You can install a specific OpenVINO version using pip: +> ``` bash +> # e.g. +> pip install openvino==2024.4.0 +> # Optional, install the openvino nightly package if needed. +> # OpenVINO nightly is pre-release software and has not undergone full release validation or qualification. +> pip uninstall openvino +> pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly +> ``` -``` bash -pip install -r requirements.txt -``` +## 4. Benchmark LLM with `torch.compile()` + +The `--torch_compile_backend` option enables you to use `torch.compile()` to accelerate PyTorch models by compiling them into optimized kernels using a specified backend. -In order to run the `torch.compile()` on CUDA GPU, install additionally the nightly PyTorch version: +Before benchmarking, you need to download the original PyTorch model. 
Use the following command to download the model locally: ```bash -pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 +huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir models/llama-2-7b-chat/pytorch ``` -Add the option `--torch_compile_backend` with the desired backend: `pytorch` or `openvino` (default) while running the benchmarking script: +To run the benchmarking script with `torch.compile()`, use the `--torch_compile_backend` option to specify the backend. You can choose between `pytorch` or `openvino` (default). Example: ```bash python ./benchmark.py -m models/llama-2-7b-chat/pytorch -d CPU --torch_compile_backend openvino ``` -## Run on 2 sockets platform +> **Note:** To use `torch.compile()` with CUDA GPUs, you need to install the nightly version of PyTorch: +> +> ```bash +> pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 +> ``` + -benchmark.py sets openvino.properties.streams.num(1) by default +## 5. Running on 2-Socket Platforms -| OpenVINO version | Behaviors | +The benchmarking script sets `openvino.properties.streams.num(1)` by default. For multi-socket platforms, use `numactl` on Linux or the `--load_config` option to modify behavior. + +| OpenVINO Version | Behaviors | |:--------------------|:------------------------------------------------| -| Before 2024.0.0 | streams.num(1)
execute on 2 sockets. | -| 2024.0.0 | streams.num(1)
execute on the same socket as the APP is running on. | +| Before 2024.0.0 | streams.num(1)
execute on 2 sockets. | +| 2024.0.0 | streams.num(1)
execute on the same socket as the APP is running on. | -numactl on Linux or --load_config for benchmark.py can be used to change the behaviors. +For example, `--load_config config.json` as following will result in streams.num(1) and execute on 2 sockets. +```json +{ + "INFERENCE_NUM_THREADS": +} +``` +`` is the number of total physical cores in 2 sockets. -For example, --load_config config.json as following in OpenVINO 2024.0.0 will result in streams.num(1) and execute on 2 sockets. -``` -{"INFERENCE_NUM_THREADS":} -``` -`` is the number of total physical cores in 2 sockets +## 6. Additional Resources -## Additional Resources -### 1. NOTE -> If you encounter any errors, please check **[NOTES.md](./doc/NOTES.md)** which provides solutions to the known errors. -### 2. Image generation -> To configure more parameters for image generation models, reference to **[IMAGE_GEN.md](./doc/IMAGE_GEN.md)** +- **Error Troubleshooting:** Check the [NOTES.md](./doc/NOTES.md) for solutions to known issues. +- **Image Generation Configuration:** Refer to [IMAGE_GEN.md](./doc/IMAGE_GEN.md) for setting parameters for image generation models. \ No newline at end of file diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 7fb6d1757b..9dcfe74f66 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -308,13 +308,14 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " f"is different from md5 of the {num - 1} iteration {prev_md5}") llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) - if num == 1: - # if the device is CPU, throw exception - if args['devices'].lower().startswith('cpu') is True: + if not args.get("use_cb", False): + if num == 1: + # if the device is CPU, throw exception + if args['devices'].lower().startswith('cpu') is True: + assert (result_md5_list == prev_md5) + else: + # throw exception assert (result_md5_list == prev_md5) - else: - # throw exception - assert (result_md5_list == prev_md5) else: llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) @@ -814,7 +815,7 @@ def get_argprser(): llm_bench_utils.model_utils.add_stateful_model_arguments(parser) parser.add_argument("--genai", action="store_true", help="Use OpenVINO GenAI optimized pipelines for benchmarking") parser.add_argument("--use_cb", action="store_true", help="Use Continuous Batching inference mode") - parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings") + parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings or dict") parser.add_argument( '--end_token_stopping', action='store_true', diff --git a/llm_bench/python/convert.py b/llm_bench/python/convert.py index ae676bc269..49cea02c11 100644 --- a/llm_bench/python/convert.py +++ b/llm_bench/python/convert.py @@ -1464,6 +1464,8 @@ def main(): add_stateful_model_arguments(parser) args = parser.parse_args() + log.warning("[DEPRECATED] Not for production use! Please use the 'optimum-intel' to generate the IRs. 
For details, please check:" + " https://github.com/openvinotoolkit/openvino.genai/blob/master/llm_bench/python/README.md#2-convert-model-to-openvino-ir-format") log.info(f"openvino runtime version: {get_version()}") model_type = get_convert_model_type(args.model_id.lower()) converter = converters[model_type] diff --git a/llm_bench/python/llm_bench_utils/model_utils.py b/llm_bench/python/llm_bench_utils/model_utils.py index b35d7be47b..3d5359e26c 100644 --- a/llm_bench/python/llm_bench_utils/model_utils.py +++ b/llm_bench/python/llm_bench_utils/model_utils.py @@ -204,11 +204,17 @@ def get_use_case(model_name_or_path): def get_config(config): - with open(config, 'r') as f: + if Path(config).is_file(): + with open(config, 'r') as f: + try: + ov_config = json.load(f) + except Exception: + raise RuntimeError(f'==Parse file:{config} failiure, json format is incorrect ==') + else: try: - ov_config = json.load(f) + ov_config = json.loads(config) except Exception: - raise RuntimeError(f'==Parse file:{config} failiure, json format is incorrect ==') + raise RuntimeError(f'==Parse config:{config} failiure, json format is incorrect ==') return ov_config diff --git a/llm_bench/python/llm_bench_utils/ov_utils.py b/llm_bench/python/llm_bench_utils/ov_utils.py index b9434c5f3d..da77f5da22 100644 --- a/llm_bench/python/llm_bench_utils/ov_utils.py +++ b/llm_bench/python/llm_bench_utils/ov_utils.py @@ -189,11 +189,14 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): cb = kwargs.get("use_cb", False) if cb: log.info("Continuous Batching mode activated") + default_cb_config = {"cache_size": 1} + if "GPU" in device: + default_cb_config["block_size"] = 16 scheduler_config = openvino_genai.SchedulerConfig() - scheduler_params = kwargs.get("cb_config") or {"cache_size": 1} + scheduler_params = kwargs.get("cb_config") or default_cb_config if scheduler_params: log.info(f"Scheduler parameters:\n{scheduler_params}") - + for param, value in scheduler_params.items(): setattr(scheduler_config, param, value) ov_config["scheduler_config"] = scheduler_config @@ -209,19 +212,24 @@ def __init__(self, tokenizer): self.token_generation_time = [] self.generated_tokens = [] self.start_time = time.perf_counter() + def put(self, token_id): self.token_generation_time.append(time.perf_counter() - self.start_time) self.generated_tokens.append(token_id) self.start_time = time.perf_counter() return False + def reset(self): self.token_generation_time = [] self.generated_tokens = [] self.start_time = time.perf_counter() + def end(self): pass + def get_tokens(self): return self.generated_tokens + def get_time_list(self): return self.token_generation_time streamer = TokenStreamer(llm_pipe.get_tokenizer()) if cb else None diff --git a/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py b/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py index 12fc726f38..5781ddf229 100644 --- a/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py +++ b/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py @@ -6,7 +6,13 @@ from whowhatbench.wwb import load_dataset from optimum.intel.openvino import OVModelForCausalLM -from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationConfig, CacheEvictionConfig, AggregationMode +from openvino_genai import ( + ContinuousBatchingPipeline, + SchedulerConfig, + GenerationConfig, + CacheEvictionConfig, + AggregationMode, +) from openvino_tokenizers import convert_tokenizer from openvino import serialize 
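Editor's note on the `model_utils.get_config` change in the hunk above: with this PR, the value passed via `--cb_config` (and `--load_config`) may be either a path to a JSON file or an inline JSON string, and the GenAI continuous-batching path in `ov_utils.py` falls back to `{"cache_size": 1}` (plus `block_size: 16` on GPU) when no config is given. The sketch below is a minimal standalone illustration of that dual parsing, not the exact code in the repository; the helper name `load_scheduler_params` and its error messages are illustrative only.

```python
import json
from pathlib import Path


def load_scheduler_params(config: str) -> dict:
    """Accept either a path to a JSON file or an inline JSON string (illustrative helper)."""
    if Path(config).is_file():
        # The argument points at a file on disk: read and parse its contents.
        with open(config, "r") as f:
            try:
                return json.load(f)
            except Exception:
                raise RuntimeError(f"Failed to parse file {config}: not valid JSON")
    # Otherwise treat the argument itself as a JSON document.
    try:
        return json.loads(config)
    except Exception:
        raise RuntimeError(f"Failed to parse config {config}: not valid JSON")


# Both invocation styles are accepted, e.g. for --cb_config:
#   --cb_config scheduler.json
#   --cb_config '{"cache_size": 1, "block_size": 16}'
print(load_scheduler_params('{"cache_size": 1}'))
```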
@@ -18,12 +24,16 @@ MAX_SEQUENCES = 100 -model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) +model = OVModelForCausalLM.from_pretrained( + model_id, export=True, trust_remote_code=True +) tokenizer = AutoTokenizer.from_pretrained(model_id) model_path = PosixPath(tempfile.gettempdir()) / model_id model.save_pretrained(model_path) -ov_tokenizer, ov_detokenizer = convert_tokenizer(tokenizer, with_detokenizer=True, skip_special_tokens=True) +ov_tokenizer, ov_detokenizer = convert_tokenizer( + tokenizer, with_detokenizer=True, skip_special_tokens=True +) serialize(ov_tokenizer, model_path / "openvino_tokenizer.xml") serialize(ov_detokenizer, model_path / "openvino_detokenizer.xml") @@ -48,24 +58,39 @@ generation_config.num_return_sequences = 1 generation_config.max_new_tokens = MAX_NEW_TOKENS -data = load_dataset(path='squad', name=None, split='validation')["context"] -data_dict = {"questions": list(dict({k: None for k in data}).keys())[:MAX_SEQUENCES]} +data = load_dataset(path="squad", name=None, split="validation")["context"] +data_dict = {"prompts": list(dict({k: None for k in data}).keys())[:MAX_SEQUENCES]} -model_cb_noopt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_noopt, "CPU", {}) -model_cb_opt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {}) +model_cb_noopt = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config_noopt, "CPU", {} +) +model_cb_opt = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {} +) -GT_DATA_FILE = 'gt_data.csv' +GT_DATA_FILE = "gt_data.csv" if os.path.exists(GT_DATA_FILE): - evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, gt_data=GT_DATA_FILE, tokenizer=tokenizer, - test_data=data_dict, generation_config=generation_config, - max_new_tokens=MAX_NEW_TOKENS, seqs_per_request=3) + evaluator = whowhatbench.TextEvaluator( + base_model=model_cb_noopt, + gt_data=GT_DATA_FILE, + tokenizer=tokenizer, + test_data=data_dict, + generation_config=generation_config, + max_new_tokens=MAX_NEW_TOKENS, + seqs_per_request=3, + ) else: - evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict, - generation_config=generation_config, max_new_tokens=MAX_NEW_TOKENS, - seqs_per_request=3) - evaluator.dump_gt('gt_data.csv') + evaluator = whowhatbench.TextEvaluator( + base_model=model_cb_noopt, + tokenizer=tokenizer, + test_data=data_dict, + generation_config=generation_config, + max_new_tokens=MAX_NEW_TOKENS, + seqs_per_request=3, + ) + evaluator.dump_gt("gt_data.csv") all_metrics_per_question, all_metrics = evaluator.score(model_cb_opt) @@ -89,8 +114,18 @@ pipeline_opt_metrics = model_cb_opt.get_metrics() pipeline_noopt_metrics = model_cb_noopt.get_metrics() -print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}") -print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}") -max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage) -avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage) -print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x") +print( + f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}" 
+) +print( + f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}" +) +max_optimization_ratio = ( + pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage +) +avg_optimization_ratio = ( + pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage +) +print( + f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x" +) diff --git a/llm_bench/python/who_what_benchmark/tests/test_cli_image.py b/llm_bench/python/who_what_benchmark/tests/test_cli_image.py new file mode 100644 index 0000000000..f4c10eac86 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/tests/test_cli_image.py @@ -0,0 +1,98 @@ +import subprocess # nosec B404 +import os +import shutil +import pytest +import logging + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def run_wwb(args): + logger.info(" ".join(["wwb"] + args)) + result = subprocess.run(["wwb"] + args, capture_output=True, text=True) + logger.info(result) + return result + + +@pytest.mark.parametrize( + ("model_id", "model_type", "backend"), + [ + ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "hf"), + ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "openvino"), + ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "sd-xl", "hf"), + ], +) +def test_image_model_types(model_id, model_type, backend): + GT_FILE = "test_sd.json" + wwb_args = [ + "--base-model", + model_id, + "--target-model", + model_id, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + ] + if backend == "hf": + wwb_args.append("--hf") + + result = run_wwb(wwb_args) + print(f"WWB result: {result}, {result.stderr}") + + try: + os.remove(GT_FILE) + except OSError: + pass + shutil.rmtree("reference", ignore_errors=True) + shutil.rmtree("target", ignore_errors=True) + + assert result.returncode == 0 + assert "Metrics for model" in result.stderr + assert "## Reference text" not in result.stderr + + +@pytest.mark.parametrize( + ("model_id", "model_type", "backend"), + [ + ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "hf"), + ], +) +def test_image_custom_dataset(model_id, model_type, backend): + GT_FILE = "test_sd.json" + wwb_args = [ + "--base-model", + model_id, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + "--dataset", + "google-research-datasets/conceptual_captions", + "--dataset-field", + "caption", + ] + if backend == "hf": + wwb_args.append("--hf") + + result = run_wwb(wwb_args) + + assert os.path.exists(GT_FILE) + + try: + os.remove(GT_FILE) + except OSError: + pass + shutil.rmtree("reference", ignore_errors=True) + + assert result.returncode == 0 diff --git a/llm_bench/python/who_what_benchmark/tests/test_cli.py b/llm_bench/python/who_what_benchmark/tests/test_cli_text.py similarity index 51% rename from llm_bench/python/who_what_benchmark/tests/test_cli.py rename to llm_bench/python/who_what_benchmark/tests/test_cli_text.py index 8110e98335..161a9afb72 100644 --- a/llm_bench/python/who_what_benchmark/tests/test_cli.py +++ b/llm_bench/python/who_what_benchmark/tests/test_cli_text.py @@ -16,11 +16,7 @@ def run_wwb(args): logger.info(" ".join(["wwb"] + args)) - result = subprocess.run( - ["wwb"] + args, - capture_output=True, - text=True - ) + result = subprocess.run(["wwb"] + args, capture_output=True, text=True) logger.info(result) return result @@ -54,13 +50,21 @@ def 
teardown_module(): shutil.rmtree(tmp_dir) -def test_target_model(): - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU" - ]) +def test_text_target_model(): + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--model-type", + "text", + ] + ) assert result.returncode == 0 assert "Metrics for model" in result.stderr @@ -68,19 +72,28 @@ def test_target_model(): @pytest.fixture -def test_gt_data(): +def test_text_gt_data(): with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: temp_file_name = tmpfile.name - result = run_wwb([ - "--base-model", base_model_path, - "--gt-data", temp_file_name, - "--dataset", "EleutherAI/lambada_openai,en", - "--dataset-field", "text", - "--split", "test", - "--num-samples", "2", - "--device", "CPU" - ]) + result = run_wwb( + [ + "--base-model", + base_model_path, + "--gt-data", + temp_file_name, + "--dataset", + "EleutherAI/lambada_openai,en", + "--dataset-field", + "text", + "--split", + "test", + "--num-samples", + "2", + "--device", + "CPU", + ] + ) data = pd.read_csv(temp_file_name) os.remove(temp_file_name) @@ -88,76 +101,107 @@ def test_gt_data(): assert len(data["questions"].values) == 2 -def test_output_directory(): +def test_text_output_directory(): with tempfile.TemporaryDirectory() as temp_dir: - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU", - "--output", temp_dir - ]) + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--output", + temp_dir, + ] + ) assert result.returncode == 0 assert "Metrics for model" in result.stderr assert os.path.exists(os.path.join(temp_dir, "metrics_per_qustion.csv")) assert os.path.exists(os.path.join(temp_dir, "metrics.csv")) -def test_verbose(): - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU", - "--verbose" - ]) +def test_text_verbose(): + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--verbose", + ] + ) assert result.returncode == 0 assert "## Diff " in result.stderr -def test_language_autodetect(): +def test_text_language_autodetect(): with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: temp_file_name = tmpfile.name - result = run_wwb([ - "--base-model", "Qwen/Qwen2-0.5B", - "--gt-data", temp_file_name, - "--num-samples", "2", - "--device", "CPU" - ]) + result = run_wwb( + [ + "--base-model", + "Qwen/Qwen2-0.5B", + "--gt-data", + temp_file_name, + "--num-samples", + "2", + "--device", + "CPU", + ] + ) data = pd.read_csv(temp_file_name) os.remove(temp_file_name) assert result.returncode == 0 - assert "马克" in data["questions"].values[0] + assert "马克" in data["prompts"].values[0] -def test_hf_model(): +def test_text_hf_model(): with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: temp_file_name = tmpfile.name - result = run_wwb([ - "--base-model", model_id, - "--gt-data", temp_file_name, - "--num-samples", "2", - "--device", "CPU", - "--hf" - ]) + result = run_wwb( + [ + "--base-model", + model_id, + "--gt-data", + temp_file_name, + "--num-samples", + "2", + "--device", + "CPU", + "--hf", + ] + ) data = pd.read_csv(temp_file_name) 
os.remove(temp_file_name) assert result.returncode == 0 - assert len(data["questions"].values) == 2 - - -def test_genai_model(): - result = run_wwb([ - "--base-model", base_model_path, - "--target-model", target_model_path, - "--num-samples", "2", - "--device", "CPU", - "--genai" - ]) + assert len(data["prompts"].values) == 2 + + +def test_text_genai_model(): + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--genai", + ] + ) assert result.returncode == 0 assert "Metrics for model" in result.stderr assert "## Reference text" not in result.stderr diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py b/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py index 86f428ddd7..4d61b0d086 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py @@ -1,4 +1,13 @@ -"""Who what benchmark APIs.""" -from .evaluator import Evaluator +from .registry import register_evaluator, MODELTYPE2TASK, EVALUATOR_REGISTRY +from .text_evaluator import TextEvaluator +from .text_evaluator import TextEvaluator as Evaluator +from .text2image_evaluator import Text2ImageEvaluator -__all__ = ["Evaluator"] +__all__ = [ + "Evaluator", + "register_evaluator", + "TextEvaluator", + "Text2ImageEvaluator", + "MODELTYPE2TASK", + "EVALUATOR_REGISTRY", +] diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/registry.py b/llm_bench/python/who_what_benchmark/whowhatbench/registry.py new file mode 100644 index 0000000000..208ba60ff3 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/whowhatbench/registry.py @@ -0,0 +1,50 @@ +from abc import ABC, abstractmethod + +from optimum.intel import ( + OVLatentConsistencyModelPipeline, + OVStableDiffusionPipeline, + OVStableDiffusionXLPipeline, +) + + +# Registry for evaluators +EVALUATOR_REGISTRY = {} +MODELTYPE2TASK = { + "text": "text-generation", + "sd": "image-generation", + "sd-xl": "image-generation", + "sd-lcm": "image-generation", +} + +TEXT2IMAGE_TASK2CLASS = { + "sd": OVStableDiffusionPipeline, + "sd-xl": OVStableDiffusionXLPipeline, + "sd-lcm": OVLatentConsistencyModelPipeline, +} + + +def register_evaluator(*names): + def decorate(cls): + for name in names: + assert ( + name not in EVALUATOR_REGISTRY + ), f"Evaluator named '{name}' conflicts with existing evaluators! Please register with a non-conflicting alias instead." 
+ + EVALUATOR_REGISTRY[name] = cls + return cls + + return decorate + + +class BaseEvaluator(ABC): + @abstractmethod + def dump_gt(self, csv_name: str): + pass + + @abstractmethod + def score(self, model, **kwargs): + pass + + @abstractmethod + def worst_examples(self, top_k: int = 5, metric="similarity"): + pass diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py b/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py new file mode 100644 index 0000000000..b8b8234547 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py @@ -0,0 +1,157 @@ +import os +from typing import Any, Union + +import pandas as pd +from tqdm import tqdm +from transformers import set_seed +import torch + +from .registry import register_evaluator, BaseEvaluator + +from .whowhat_metrics import ImageSimilarity + +default_data = { + "prompts": [ + "Cinematic, a vibrant Mid-century modern dining area, colorful chairs and a sideboard, ultra realistic, many detail", + "colibri flying near a flower, side view, forest background, natural light, photorealistic, 4k", + "Illustration of an astronaut sitting in outer space, moon behind him", + "A vintage illustration of a retro computer, vaporwave aesthetic, light pink and light blue", + "A view from beautiful alien planet, very beautiful, surealism, retro astronaut on the first plane, 8k photo", + ], +} + + +@register_evaluator("image-generation") +class Text2ImageEvaluator(BaseEvaluator): + def __init__( + self, + base_model: Any = None, + gt_data: str = None, + test_data: Union[str, list] = None, + metrics="similarity", + similarity_model_id: str = "openai/clip-vit-large-patch14", + resolution=(512, 512), + num_inference_steps=4, + crop_prompts=True, + num_samples=None, + gen_image_fn=None, + seed=42, + ) -> None: + assert ( + base_model is not None or gt_data is not None + ), "Text generation pipeline for evaluation or ground trush data must be defined" + + self.test_data = test_data + self.metrics = metrics + self.resolution = resolution + self.crop_prompt = crop_prompts + self.num_samples = num_samples + self.num_inference_steps = num_inference_steps + self.seed = seed + self.similarity = None + self.similarity = ImageSimilarity(similarity_model_id) + self.last_cmp = None + self.gt_dir = os.path.dirname(gt_data) + if base_model: + self.gt_data = self._generate_data( + base_model, gen_image_fn, os.path.join(self.gt_dir, "reference") + ) + else: + self.gt_data = pd.read_csv(gt_data, keep_default_na=False) + + def dump_gt(self, csv_name: str): + self.gt_data.to_csv(csv_name) + + def score(self, model, gen_image_fn=None): + predictions = self._generate_data( + model, gen_image_fn, os.path.join(self.gt_dir, "target") + ) + + all_metrics_per_prompt = {} + all_metrics = {} + + if self.similarity: + metric_dict, metric_per_question = self.similarity.evaluate( + self.gt_data, predictions + ) + all_metrics.update(metric_dict) + all_metrics_per_prompt.update(metric_per_question) + + self.last_cmp = all_metrics_per_prompt + self.last_cmp["prompts"] = predictions["prompts"].values + self.last_cmp["source_model"] = self.gt_data["images"].values + self.last_cmp["optimized_model"] = predictions["images"].values + self.last_cmp = pd.DataFrame(self.last_cmp) + + return pd.DataFrame(all_metrics_per_prompt), pd.DataFrame([all_metrics]) + + def worst_examples(self, top_k: int = 5, metric="similarity"): + assert self.last_cmp is not None + + res = self.last_cmp.nsmallest(top_k, metric) + res = list(row for 
idx, row in res.iterrows()) + + return res + + def _generate_data(self, model, gen_image_fn=None, image_dir="reference"): + if hasattr(model, "reshape") and self.resolution is not None: + model.reshape( + batch_size=1, + height=self.resolution[0], + width=self.resolution[1], + num_images_per_prompt=1, + ) + + def default_gen_image_fn(model, prompt, num_inference_steps, generator=None): + output = model( + prompt, + num_inference_steps=num_inference_steps, + output_type="pil", + width=self.resolution[0], + height=self.resolution[0], + generator=generator, + ) + return output.images[0] + + gen_image_fn = gen_image_fn or default_gen_image_fn + + if self.test_data: + if isinstance(self.test_data, str): + data = pd.read_csv(self.test_data) + else: + if isinstance(self.test_data, dict): + assert "prompts" in self.test_data + data = dict(self.test_data) + else: + data = {"prompts": list(self.test_data)} + data = pd.DataFrame.from_dict(data) + else: + data = pd.DataFrame.from_dict(default_data) + + prompts = data["prompts"] + prompts = ( + prompts.values + if self.num_samples is None + else prompts.values[: self.num_samples] + ) + images = [] + rng = torch.Generator(device="cpu") + + if not os.path.exists(image_dir): + os.makedirs(image_dir) + for i, prompt in tqdm(enumerate(prompts), desc="Evaluate pipeline"): + set_seed(self.seed) + image = gen_image_fn( + model, + prompt, + self.num_inference_steps, + generator=rng.manual_seed(self.seed), + ) + image_path = os.path.join(image_dir, f"{i}.png") + image.save(image_path) + images.append(image_path) + + res_data = {"prompts": list(prompts), "images": images} + df = pd.DataFrame(res_data) + + return df diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py b/llm_bench/python/who_what_benchmark/whowhatbench/text_evaluator.py similarity index 71% rename from llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py rename to llm_bench/python/who_what_benchmark/whowhatbench/text_evaluator.py index bb0d17e34e..436d2be034 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/text_evaluator.py @@ -3,11 +3,12 @@ import pandas as pd from tqdm import tqdm -from .whowhat_metrics import DivergencyMetric, SimilarityMetric +from .registry import register_evaluator, BaseEvaluator +from .whowhat_metrics import TextDivergency, TextSimilarity default_data = { - "en" : { - "questions": [ + "en": { + "prompts": [ "Who is Mark Twain?", "Who is William Shakespeare?", "Who is Agatha Christie?", @@ -38,12 +39,12 @@ ], }, "cn": { - "questions": [ + "prompts": [ "马克吐温是谁?", "谁是威廉-莎士比亚?", "阿加莎-克里斯蒂是谁?", "芭芭拉-卡特兰是谁?", - "丹妮尔-斯蒂尔是谁?" 
+ "丹妮尔-斯蒂尔是谁?", "谁是哈罗德-罗宾斯?", "乔治-西默农是谁?", "伊妮德-布莱顿是谁?", @@ -86,7 +87,10 @@ def autodetect_language(model): return model2language.get(model.config.model_type, "en") -class Evaluator: +@register_evaluator( + "text-generation", "text-generation-with-past", "text2text-generation" +) +class TextEvaluator(BaseEvaluator): def __init__( self, base_model: Any = None, @@ -102,7 +106,7 @@ def __init__( gen_answer_fn=None, generation_config=None, generation_config_base=None, - seqs_per_request=None + seqs_per_request=None, ) -> None: assert ( base_model is not None or gt_data is not None @@ -127,7 +131,9 @@ def __init__( self.language = autodetect_language(base_model) if base_model: - self.gt_data = self._generate_data(base_model, gen_answer_fn, generation_config=generation_config) + self.gt_data = self._generate_data( + base_model, gen_answer_fn, generation_config=generation_config + ) else: self.gt_data = pd.read_csv(gt_data, keep_default_na=False) @@ -138,10 +144,10 @@ def __init__( self.similarity = None self.divergency = None if "similarity" in self.metrics: - self.similarity = SimilarityMetric(similarity_model_id) + self.similarity = TextSimilarity(similarity_model_id) if "divergency" in self.metrics: assert tokenizer is not None - self.divergency = DivergencyMetric(tokenizer) + self.divergency = TextDivergency(tokenizer) self.last_cmp = None @@ -151,7 +157,7 @@ def dump_gt(self, csv_name: str): def score(self, model, gen_answer_fn=None): predictions = self._generate_data(model, gen_answer_fn, self.generation_config) - all_metrics_per_question = {} + all_metrics_per_prompt = {} all_metrics = {} if self.similarity: @@ -159,23 +165,23 @@ def score(self, model, gen_answer_fn=None): self.gt_data, predictions ) all_metrics.update(metric_dict) - all_metrics_per_question.update(metric_per_question) + all_metrics_per_prompt.update(metric_per_question) if self.divergency: metric_dict, metric_per_question = self.divergency.evaluate( self.gt_data, predictions ) all_metrics.update(metric_dict) - all_metrics_per_question.update(metric_per_question) + all_metrics_per_prompt.update(metric_per_question) - self.last_cmp = all_metrics_per_question - self.last_cmp["questions"] = predictions["questions"].values + self.last_cmp = all_metrics_per_prompt + self.last_cmp["prompts"] = predictions["prompts"].values self.last_cmp["source_model"] = self.gt_data["answers"].values self.last_cmp["optimized_model"] = predictions["answers"].values self.last_cmp = pd.DataFrame(self.last_cmp) - self.last_cmp.rename(columns={"questions": "prompt"}, inplace=True) + self.last_cmp.rename(columns={"prompts": "prompt"}, inplace=True) - return pd.DataFrame(all_metrics_per_question), pd.DataFrame([all_metrics]) + return pd.DataFrame(all_metrics_per_prompt), pd.DataFrame([all_metrics]) def worst_examples(self, top_k: int = 5, metric="similarity"): assert self.last_cmp is not None @@ -190,12 +196,12 @@ def worst_examples(self, top_k: int = 5, metric="similarity"): return res def _generate_data(self, model, gen_answer_fn=None, generation_config=None): - def default_gen_answer(model, tokenizer, question, max_new_tokens, crop_question): - inputs = self.tokenizer(question, return_tensors="pt") + def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question): + inputs = self.tokenizer(prompt, return_tensors="pt") - tokens = model.generate(**inputs, max_new_tokens=max_new_tokens) + tokens = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens) out = self.tokenizer.batch_decode(tokens, 
skip_special_tokens=True)[0] - return out[len(question) :] if crop_question else out + return out[len(prompt) :] if crop_question else out gen_answer_fn = gen_answer_fn or default_gen_answer @@ -204,39 +210,58 @@ def default_gen_answer(model, tokenizer, question, max_new_tokens, crop_question data = pd.read_csv(self.test_data) else: if isinstance(self.test_data, dict): - assert "questions" in self.test_data + assert "prompts" in self.test_data data = dict(self.test_data) else: - data = {"questions": list(self.test_data)} + data = {"prompts": list(self.test_data)} data = pd.DataFrame.from_dict(data) else: if self.language is None: - print("No language detecting in the base model or ground truth data. Taking language from target model.") + print( + "No language detecting in the base model or ground truth data. Taking language from target model." + ) self.language = autodetect_language(model) data = pd.DataFrame.from_dict(default_data[self.language]) - questions = data["questions"] + prompt_data = data["prompts"] answers = [] - prompts = questions.values if self.num_samples is None else questions.values[:self.num_samples] + prompts = ( + prompt_data.values + if self.num_samples is None + else prompt_data.values[: self.num_samples] + ) if generation_config is None: - for q in tqdm(prompts, desc="Evaluate pipeline"): - answers.append(gen_answer_fn(model, self.tokenizer, q, self.max_new_tokens, self._crop_question)) + for p in tqdm(prompts, desc="Evaluate pipeline"): + answers.append( + gen_answer_fn( + model, + self.tokenizer, + p, + self.max_new_tokens, + self._crop_question, + ) + ) else: - with tqdm(total=len(questions.values)) as progress_bar: + with tqdm(total=len(prompt_data.values)) as progress_bar: batch = [] - for q_idx, q in enumerate(questions.values): + for p_idx, p in enumerate(prompt_data.values): progress_bar.update(1) - batch.append(q) - if len(batch) == self.seqs_per_request or q_idx == len(questions.values) - 1: - ans_batch = model.generate(batch, [generation_config] * len(batch)) + batch.append(p) + if ( + len(batch) == self.seqs_per_request + or p_idx == len(prompt_data.values) - 1 + ): + ans_batch = model.generate( + batch, [generation_config] * len(batch) + ) for ans in ans_batch: answers.append(ans.m_generation_ids[0]) batch.clear() - res_data = {"questions": list(prompts), "answers": answers} + res_data = {"prompts": list(prompts), "answers": answers} df = pd.DataFrame(res_data) df["language"] = self.language diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py b/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py index 83157e05ca..bbf96a3312 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py @@ -1,10 +1,15 @@ """ Metrics for text similarity """ + from difflib import SequenceMatcher +from PIL import Image +import torch +import torch.nn.functional as F import numpy as np from sentence_transformers import SentenceTransformer, util +from transformers import CLIPImageProcessor, CLIPModel from tqdm import tqdm @@ -68,9 +73,7 @@ def evaluate_divergency(tokenizer, data_gold, data_prediction): fdt_list.append(fdt) num_matched = sum(block.size for block in blocks) - sdt = ( - len(b_indexes) - num_matched - ) + sdt = len(b_indexes) - num_matched sdt_list.append(sdt) sdt_norm = sdt / len(b_indexes) sdtn_list.append(sdt_norm) @@ -104,7 +107,7 @@ def evaluate_divergency(tokenizer, data_gold, data_prediction): return metric_dict, 
metric_per_question -class SimilarityMetric: +class TextSimilarity: def __init__(self, model_id) -> None: self.model = SentenceTransformer(model_id) @@ -112,9 +115,47 @@ def evaluate(self, gt, prediction): return evaluate_similarity(self.model, gt, prediction) -class DivergencyMetric: +class TextDivergency: def __init__(self, tokenizer) -> None: self.tokenizer = tokenizer def evaluate(self, gt, prediction): return evaluate_divergency(self.tokenizer, gt, prediction) + + +# Image metrics +def evaluate_image_similarity(processor, model, data_gold, data_prediction): + images_gold = data_gold["images"].values + images_prediction = data_prediction["images"].values + + metric_per_image = [] + for gold, prediction in tqdm( + zip(images_gold, images_prediction), desc="Image Similarity evaluation" + ): + gold_image = Image.open(gold) + prediction_image = Image.open(prediction) + + gold_inputs = processor(images=gold_image, return_tensors="pt")["pixel_values"] + prediction_inputs = processor(images=prediction_image, return_tensors="pt")[ + "pixel_values" + ] + + with torch.no_grad(): + gold_outputs = model.get_image_features(gold_inputs) + prediction_outputs = model.get_image_features(prediction_inputs) + + cos_sim = F.cosine_similarity(gold_outputs, prediction_outputs) + print("cos_sim: ", cos_sim.item()) + metric_per_image.append(cos_sim.item()) + + metric_dict = {"similarity": np.mean(metric_per_image)} + return metric_dict, {"similarity": metric_per_image} + + +class ImageSimilarity: + def __init__(self, model_id) -> None: + self.processor = CLIPImageProcessor.from_pretrained(model_id) + self.model = CLIPModel.from_pretrained(model_id).eval() + + def evaluate(self, gt, prediction): + return evaluate_image_similarity(self.processor, self.model, gt, prediction) diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py b/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py index 8efca22059..3798bb044c 100644 --- a/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py +++ b/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py @@ -5,28 +5,40 @@ import pandas as pd import logging from datasets import load_dataset -from optimum.exporters import TasksManager +from diffusers import DiffusionPipeline from optimum.intel.openvino import OVModelForCausalLM from optimum.utils import NormalizedConfigManager, NormalizedTextConfig from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM -from . 
import Evaluator +from optimum.exporters.tasks import TasksManager +from optimum.intel import ( + OVLatentConsistencyModelPipeline, + OVStableDiffusionPipeline, + OVStableDiffusionXLPipeline, +) + +import openvino_genai +from whowhatbench import EVALUATOR_REGISTRY, MODELTYPE2TASK + # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -TasksManager._SUPPORTED_MODEL_TYPE["stablelm-epoch"] = TasksManager._SUPPORTED_MODEL_TYPE["llama"] +TasksManager._SUPPORTED_MODEL_TYPE["stablelm-epoch"] = ( + TasksManager._SUPPORTED_MODEL_TYPE["llama"] +) NormalizedConfigManager._conf["stablelm-epoch"] = NormalizedTextConfig.with_args( num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", ) -class GenAIModelWrapper(): +class GenAIModelWrapper: """ A helper class to store additional attributes for GenAI models """ + def __init__(self, model, model_dir): self.model = model self.config = AutoConfig.from_pretrained(model_dir) @@ -38,7 +50,7 @@ def __getattr__(self, attr): return getattr(self.model, attr) -def load_genai_pipeline(model_dir, device="CPU"): +def load_text_genai_pipeline(model_dir, device="CPU"): try: import openvino_genai except ImportError: @@ -48,13 +60,17 @@ def load_genai_pipeline(model_dir, device="CPU"): return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device), model_dir) -def load_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False): +def load_text_model( + model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): if use_hf: logger.info("Using HF Transformers API") - return AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, device_map=device.lower()) + return AutoModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device_map=device.lower() + ) if use_genai: - return load_genai_pipeline(model_id, device) + return load_text_genai_pipeline(model_id, device) if ov_config: with open(ov_config) as f: @@ -62,7 +78,9 @@ def load_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=F else: ov_options = None try: - model = OVModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, device=device, ov_config=ov_options) + model = OVModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device=device, ov_config=ov_options + ) except ValueError: config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) model = OVModelForCausalLM.from_pretrained( @@ -71,11 +89,67 @@ def load_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=F trust_remote_code=True, use_cache=True, device=device, - ov_config=ov_options + ov_config=ov_options, ) return model +TEXT2IMAGE_TASK2CLASS = { + "sd": OVStableDiffusionPipeline, + "sd-xl": OVStableDiffusionXLPipeline, + "sd-lcm": OVLatentConsistencyModelPipeline, +} + + +def load_text2image_model( + model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): + if ov_config: + with open(ov_config) as f: + ov_options = json.load(f) + else: + ov_options = None + + if use_hf: + return DiffusionPipeline.from_pretrained(model_id, trust_remote_code=True) + + TEXT2IMAGEPipeline = TEXT2IMAGE_TASK2CLASS[model_type] + + try: + model = TEXT2IMAGEPipeline.from_pretrained( + model_id, trust_remote_code=True, device=device, ov_config=ov_options + ) + except ValueError: + config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + model = TEXT2IMAGEPipeline.from_pretrained( + model_id, + config=config, + trust_remote_code=True, + 
use_cache=True, + device=device, + ov_config=ov_options, + ) + return model + + +def load_model( + model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): + from .registry import MODELTYPE2TASK + + if model_id is None: + return None + + if model_type == "text": + return load_text_model(model_id, device, ov_config, use_hf, use_genai) + elif MODELTYPE2TASK[model_type] == "image-generation": + return load_text2image_model( + model_type, model_id, device, ov_config, use_hf, use_genai + ) + else: + raise ValueError(f"Unsupported model type: {model_type}") + + def load_prompts(args): if args.dataset is None: return None @@ -93,7 +167,7 @@ def load_prompts(args): res = data[args.dataset_field] - res = {"questions": list(res)} + res = {"prompts": list(res)} return res @@ -127,7 +201,14 @@ def parse_args(): "I defined and not exists them will be generated by base_model evaluation.", ) parser.add_argument( - "--text-encoder", + "--model-type", + type=str, + choices=["text", "sd", "sd-xl", "sd-lcm"], + default="text", + help="Indicated the model type, e.g. 'text', 'sd'.", + ) + parser.add_argument( + "--data-encoder", type=str, default="sentence-transformers/all-mpnet-base-v2", help="Model for measurement of similarity between base_model and target_model." @@ -145,7 +226,7 @@ def parse_args(): parser.add_argument( "--dataset-field", type=str, - default="questions", + default="text", help="The name of field in dataset for prompts. For example question or context in squad." "Will be used only if dataset is defined.", ) @@ -258,44 +339,120 @@ def diff_strings(a: str, b: str, *, use_loguru_colors: bool = False) -> str: def genai_gen_answer(model, tokenizer, question, max_new_tokens, skip_question): - out = model.generate(question, max_new_tokens=max_new_tokens) + config = openvino_genai.GenerationConfig() + config.max_new_tokens = max_new_tokens + out = model.generate(question, config) return out +def get_evaluator(base_model, args): + # config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + # task = TasksManager.infer_task_from_model(config._name_or_path) + # TODO: Add logic to auto detect task based on model_id (TaskManager does not work for locally saved models) + task = MODELTYPE2TASK[args.model_type] + + try: + EvaluatorCLS = EVALUATOR_REGISTRY[task] + prompts = load_prompts(args) + + if task == "text-generation": + tokenizer = load_tokenizer(args) + return EvaluatorCLS( + base_model=base_model, + gt_data=args.gt_data, + test_data=prompts, + tokenizer=tokenizer, + similarity_model_id=args.data_encoder, + num_samples=args.num_samples, + language=args.language, + gen_answer_fn=genai_gen_answer if args.genai else None, + ) + elif task == "image-generation": + return EvaluatorCLS( + base_model=base_model, + gt_data=args.gt_data, + test_data=prompts, + num_samples=args.num_samples, + ) + else: + raise ValueError(f"Unsupported task: {task}") + + except KeyError: + raise ValueError( + f"Attempted to load evaluator for '{task}', but no evaluator for this model type found!" 
+ "Supported model types: {', '.join(EVALUATOR_REGISTRY.keys())}" + ) + + +def print_text_results(evaluator): + metric_of_interest = "similarity" + worst_examples = evaluator.worst_examples(top_k=5, metric=metric_of_interest) + for i, e in enumerate(worst_examples): + ref_text = "" + actual_text = "" + diff = "" + for l1, l2 in zip( + e["source_model"].splitlines(), e["optimized_model"].splitlines() + ): + if l1 == "" and l2 == "": + continue + ref_text += l1 + "\n" + actual_text += l2 + "\n" + diff += diff_strings(l1, l2) + "\n" + + logger.info( + "--------------------------------------------------------------------------------------" + ) + logger.info("## Reference text %d:\n%s", i + 1, ref_text) + logger.info("## Actual text %d:\n%s", i + 1, actual_text) + logger.info("## Diff %d: ", i + 1) + logger.info(diff) + + +def print_image_results(evaluator): + metric_of_interest = "similarity" + worst_examples = evaluator.worst_examples(top_k=1, metric=metric_of_interest) + for i, e in enumerate(worst_examples): + logger.info( + "--------------------------------------------------------------------------------------" + ) + logger.info(f"Top-{i+1} example:") + logger.info(e) + + def main(): args = parse_args() check_args(args) - prompts = load_prompts(args) - tokenizer = load_tokenizer(args) if args.gt_data and os.path.exists(args.gt_data): - evaluator = Evaluator( - base_model=None, - gt_data=args.gt_data, - test_data=prompts, - tokenizer=tokenizer, - similarity_model_id=args.text_encoder, - num_samples=args.num_samples, - language=args.language, - ) + evaluator = get_evaluator(None, args) else: - base_model = load_model(args.base_model, args.device, args.ov_config, args.hf, args.genai) - evaluator = Evaluator( - base_model=base_model, - test_data=prompts, - tokenizer=tokenizer, - similarity_model_id=args.text_encoder, - num_samples=args.num_samples, - language=args.language, - gen_answer_fn=genai_gen_answer if args.genai else None + base_model = load_model( + args.model_type, + args.base_model, + args.device, + args.ov_config, + args.hf, + args.genai, ) + evaluator = get_evaluator(base_model, args) + if args.gt_data: evaluator.dump_gt(args.gt_data) del base_model if args.target_model: - target_model = load_model(args.target_model, args.device, args.ov_config, args.hf, args.genai) - all_metrics_per_question, all_metrics = evaluator.score(target_model, genai_gen_answer if args.genai else None) + target_model = load_model( + args.model_type, + args.target_model, + args.device, + args.ov_config, + args.hf, + args.genai, + ) + all_metrics_per_question, all_metrics = evaluator.score( + target_model, genai_gen_answer if args.genai else None + ) logger.info("Metrics for model: %s", args.target_model) logger.info(all_metrics) @@ -307,25 +464,11 @@ def main(): df = pd.DataFrame(all_metrics) df.to_csv(os.path.join(args.output, "metrics.csv")) - if args.verbose: - metric_of_interest = "similarity" - worst_examples = evaluator.worst_examples(top_k=5, metric=metric_of_interest) - for i, e in enumerate(worst_examples): - ref_text = "" - actual_text = "" - diff = "" - for l1, l2 in zip(e["source_model"].splitlines(), e["optimized_model"].splitlines()): - if l1 == "" and l2 == "": - continue - ref_text += l1 + "\n" - actual_text += l2 + "\n" - diff += diff_strings(l1, l2) + "\n" - - logger.info("--------------------------------------------------------------------------------------") - logger.info("## Reference text %d:\n%s", i + 1, ref_text) - logger.info("## Actual text %d:\n%s", i + 1, actual_text) - 
logger.info("## Diff %d: ", i + 1) - logger.info(diff) + if args.verbose and args.target_model is not None: + if args.model_type == "text": + print_text_results(evaluator) + elif "sd" in args.model_type: + print_image_results(evaluator) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index b7a23efa98..7be4478108 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ find_python3 = true build_args = ["--parallel", "--target", "py_generate_pipeline"] install_args = ["--strip"] install_components = ["wheel_genai"] +options = {"BUILD_TOKENIZERS" = "OFF"} [build-system] requires = [ diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 200887475a..2a8f26ff4d 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -14,7 +14,7 @@ add_subdirectory(cpp/visual_language_chat) add_subdirectory(cpp/speculative_decoding_lm) add_subdirectory(cpp/benchmark_genai) add_subdirectory(cpp/whisper_speech_recognition) -add_subdirectory(cpp/stable_diffusion) +add_subdirectory(cpp/text2image) install(FILES requirements.txt DESTINATION samples COMPONENT cpp_samples_genai) @@ -28,7 +28,7 @@ install(DIRECTORY # Don't install continuous_batching_accuracy and continuous_batching_benchmark because CB isn't ready. cpp/visual_language_chat cpp/whisper_speech_recognition - cpp/stable_diffusion + cpp/text2image cpp/lora_greedy_causal_lm DESTINATION samples/cpp COMPONENT cpp_samples_genai) @@ -38,6 +38,6 @@ install(DIRECTORY python/greedy_causal_lm python/multinomial_causal_lm python/whisper_speech_recognition - # python/stable_diffusion + # python/text2image DESTINATION samples/python COMPONENT cpp_samples_genai USE_SOURCE_PERMISSIONS) diff --git a/samples/cpp/stable_diffusion/README.md b/samples/cpp/stable_diffusion/README.md deleted file mode 100644 index 5e6bfd0f9d..0000000000 --- a/samples/cpp/stable_diffusion/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# Stable Diffusion C++ Image Generation Pipeline - -This example showcases inference of text to image models like Stable Diffusion 1.5, 2.1, LCM. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::Text2ImagePipeline` and uses a text prompt as input source. - -Users can change the sample code and play with the following generation parameters: - -- Change width or height of generated image -- Generate multiple images per prompt -- Adjust a number of inference steps -- Play with [guidance scale](https://huggingface.co/spaces/stabilityai/stable-diffusion/discussions/9) (read [more details](https://arxiv.org/abs/2207.12598)) -- (SD 1.x, 2.x only) Add negative prompt when guidance scale > 1 - -## Download and convert the models and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. 
- -```sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16 -``` - -## Run - -`stable_diffusion ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'` - -### Examples - -Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` - - ![](./512x512.bmp) - -## Supported models - -Models can be downloaded from [OpenAI HiggingFace](https://huggingface.co/openai). This sample can run the following list of models, but not limitied to: - -- [botp/stable-diffusion-v1-5](https://huggingface.co/botp/stable-diffusion-v1-5) -- [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) -- [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) -- [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) -- [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) - -## Note - -- Image generated with HuggingFace / Optimum Intel is not the same generated by this C++ sample: - -C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor`. So, it's expected that image generated by Python and C++ versions provide different images, because latent images are initialize differently. Users can implement their own random generator derived from `ov::genai::Generator` and pass it to `Text2ImagePipeline::generate` method. diff --git a/samples/cpp/stable_diffusion/main.cpp b/samples/cpp/stable_diffusion/main.cpp deleted file mode 100644 index 05fc7a2535..0000000000 --- a/samples/cpp/stable_diffusion/main.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "openvino/genai/text2image/pipeline.hpp" - -#include "imwrite.hpp" - -int32_t main(int32_t argc, char* argv[]) try { - OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); - - const std::string models_path = argv[1], prompt = argv[2]; - const std::string device = "CPU"; // GPU, NPU can be used as well - - ov::genai::Text2ImagePipeline pipe(models_path, device); - ov::Tensor image = pipe.generate(prompt, - ov::genai::width(512), - ov::genai::height(512), - ov::genai::num_inference_steps(20)); - - imwrite("image.bmp", image, true); - - return EXIT_SUCCESS; -} catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; -} catch (...) 
{ - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; -} diff --git a/samples/cpp/stable_diffusion/512x512.bmp b/samples/cpp/text2image/512x512.bmp similarity index 100% rename from samples/cpp/stable_diffusion/512x512.bmp rename to samples/cpp/text2image/512x512.bmp diff --git a/samples/cpp/stable_diffusion/CMakeLists.txt b/samples/cpp/text2image/CMakeLists.txt similarity index 56% rename from samples/cpp/stable_diffusion/CMakeLists.txt rename to samples/cpp/text2image/CMakeLists.txt index a7a6f067b3..ca0f832f6d 100644 --- a/samples/cpp/stable_diffusion/CMakeLists.txt +++ b/samples/cpp/text2image/CMakeLists.txt @@ -8,7 +8,7 @@ find_package(OpenVINOGenAI REQUIRED NO_CMAKE_FIND_ROOT_PATH ) -# create executable +# create main sample executable add_executable(stable_diffusion ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp @@ -26,3 +26,22 @@ install(TARGETS stable_diffusion RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) + +# create LoRA sample executable + +add_executable(lora_stable_diffusion + ${CMAKE_CURRENT_SOURCE_DIR}/lora.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/imwrite.cpp) + +target_include_directories(lora_stable_diffusion PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(lora_stable_diffusion PRIVATE openvino::genai) + +set_target_properties(lora_stable_diffusion PROPERTIES + COMPILE_PDB_NAME lora_stable_diffusion + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS lora_stable_diffusion + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/text2image/README.md b/samples/cpp/text2image/README.md new file mode 100644 index 0000000000..f73da334f4 --- /dev/null +++ b/samples/cpp/text2image/README.md @@ -0,0 +1,78 @@ +# Text to Image C++ Generation Pipeline + +Examples in this folder showcase inference of text to image models like Stable Diffusion 1.5, 2.1, LCM. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::Text2ImagePipeline` and uses a text prompt as input source. + +There are two sample files: + - [`main.cpp`](./main.cpp) demonstrates basic usage of the text to image pipeline + - [`lora.cpp`](./lora.cpp) shows how to apply LoRA adapters to the pipeline + +Users can change the sample code and play with the following generation parameters: + +- Change width or height of generated image +- Generate multiple images per prompt +- Adjust a number of inference steps +- Play with [guidance scale](https://huggingface.co/spaces/stabilityai/stable-diffusion/discussions/9) (read [more details](https://arxiv.org/abs/2207.12598)) +- (SD 1.x, 2.x only) Add negative prompt when guidance scale > 1 +- Apply multiple different LoRA adapters and mix them with different blending coefficients + +## Download and convert the models and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. 
+ +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16 +``` + +## Run + +`stable_diffusion ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'` + +### Examples + +Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` + + ![](./512x512.bmp) + +## Supported models + +Models can be downloaded from [HuggingFace](https://huggingface.co/models). This sample can run the following list of models, but not limited to: + +- [botp/stable-diffusion-v1-5](https://huggingface.co/botp/stable-diffusion-v1-5) +- [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) +- [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) +- [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) +- [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) +- [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) +- [stabilityai/stable-diffusion-xl-base-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9) + +## Run with optional LoRA adapters + +LoRA adapters can be connected to the pipeline and modify generated images to have certain style, details or quality. Adapters are supported in Safetensors format and can be downloaded from public sources like [Civitai](https://civitai.com) or [HuggingFace](https://huggingface.co/models) or trained by the user. Adapters compatible with a base model should be used only. A weighted blend of multiple adapters can be applied by specifying multple adapter files with corresponding alpha parameters in command line. Check `lora.cpp` source code to learn how to enable adapters and specify them in each `generate` call. + +Here is an example how to run the sample with a single adapter. First download adapter file from https://civitai.com/models/67927/soulcard page manually and save it as `soulcard.safetensors`. Or download it from command line: + +`wget -O soulcard.safetensors https://civitai.com/api/download/models/72591` + +Then run `lora_stable_diffusion` executable: + +`./lora_stable_diffusion dreamlike_anime_1_0_ov/FP16 'curly-haired unicorn in the forest, anime, line' soulcard.safetensors 0.7` + +The sample generates two images with and without adapters applied using the same prompt: + - `lora.bmp` with adapters applied + - `baseline.bmp` without adapters applied + +Check the difference: + +With adapter | Without adapter +:---:|:---: +![](./lora.bmp) | ![](./baseline.bmp) + + +## Note + +- Image generated with HuggingFace / Optimum Intel is not the same generated by this C++ sample: + +C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor`. So, it's expected that image generated by Python and C++ versions provide different images, because latent images are initialize differently. Users can implement their own random generator derived from `ov::genai::Generator` and pass it to `Text2ImagePipeline::generate` method. 
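For readers of the new text2image README who want to blend several LoRA adapters programmatically rather than through `lora_stable_diffusion` command-line arguments, here is a minimal sketch. It only reuses calls that appear in `lora.cpp` and `main.cpp` from this patch (`ov::genai::Adapter`, `AdapterConfig::add`, `ov::genai::adapters(...)`, `width`/`height`/`num_inference_steps`, `imwrite`); the adapter file names, alpha values, model path, and prompt are placeholder assumptions, not part of the sample set.

```cpp
// Minimal sketch (not part of the samples): blend two LoRA adapters with different alphas.
// Adapter file names, alphas, model path and prompt below are placeholders.
#include "openvino/genai/text2image/pipeline.hpp"
#include "imwrite.hpp"

int main() {
    ov::genai::AdapterConfig adapter_config;
    adapter_config.add(ov::genai::Adapter("soulcard.safetensors"), 0.7f);          // first adapter, alpha = 0.7
    adapter_config.add(ov::genai::Adapter("another_style.safetensors"), 0.3f);     // hypothetical second adapter, alpha = 0.3

    // Adapters passed to the constructor are active by default in subsequent generate() calls.
    ov::genai::Text2ImagePipeline pipe("dreamlike_anime_1_0_ov/FP16", "CPU",
                                       ov::genai::adapters(adapter_config));

    ov::Tensor image = pipe.generate("curly-haired unicorn in the forest, anime, line",
                                     ov::genai::width(512),
                                     ov::genai::height(896),
                                     ov::genai::num_inference_steps(20));
    imwrite("blended_lora.bmp", image, true);
    return 0;
}
```

As in `lora.cpp`, passing `ov::genai::adapters()` with no arguments to an individual `generate()` call would override the constructor-time configuration and disable the adapters for that call.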
diff --git a/samples/cpp/text2image/baseline.bmp b/samples/cpp/text2image/baseline.bmp new file mode 100644 index 0000000000..aa9a51ccf6 Binary files /dev/null and b/samples/cpp/text2image/baseline.bmp differ diff --git a/samples/cpp/stable_diffusion/imwrite.cpp b/samples/cpp/text2image/imwrite.cpp similarity index 100% rename from samples/cpp/stable_diffusion/imwrite.cpp rename to samples/cpp/text2image/imwrite.cpp diff --git a/samples/cpp/stable_diffusion/imwrite.hpp b/samples/cpp/text2image/imwrite.hpp similarity index 100% rename from samples/cpp/stable_diffusion/imwrite.hpp rename to samples/cpp/text2image/imwrite.hpp diff --git a/samples/cpp/text2image/lora.bmp b/samples/cpp/text2image/lora.bmp new file mode 100644 index 0000000000..62859e4bdd Binary files /dev/null and b/samples/cpp/text2image/lora.bmp differ diff --git a/samples/cpp/text2image/lora.cpp b/samples/cpp/text2image/lora.cpp new file mode 100644 index 0000000000..0db7b55fe9 --- /dev/null +++ b/samples/cpp/text2image/lora.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/pipeline.hpp" + +#include "imwrite.hpp" + +int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " '' [ ...]]"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::AdapterConfig adapter_config; + // Multiple LoRA adapters applied simultaniously are supported, parse them all and corresponding alphas from cmd parameters: + for(size_t i = 0; i < (argc - 3)/2; ++i) { + ov::genai::Adapter adapter(argv[3 + 2*i]); + float alpha = std::atof(argv[3 + 2*i + 1]); + adapter_config.add(adapter, alpha); + } + + // LoRA adapters passed to the constructor will be activated by default in next generates + ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config)); + + std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; + ov::Tensor image = pipe.generate(prompt, + ov::genai::random_generator(std::make_shared(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("lora.bmp", image, true); + + std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; + image = pipe.generate(prompt, + ov::genai::adapters(), // passing adapters in generate overrides adapters set in the constructor; adapters() means no adapters + ov::genai::random_generator(std::make_shared(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("baseline.bmp", image, true); + + return EXIT_SUCCESS; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/text2image/main.cpp b/samples/cpp/text2image/main.cpp new file mode 100644 index 0000000000..02c632d53e --- /dev/null +++ b/samples/cpp/text2image/main.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/pipeline.hpp" + +#include "imwrite.hpp" + +namespace { + + void imwrite_output_imgs(const ov::Tensor& output) { + ov::Shape out_shape = output.get_shape(); + + if (out_shape[0] == 1) { + imwrite("image.bmp", output, true); + return; + } + + ov::Shape img_shape = {1, out_shape[1], out_shape[2], out_shape[3]}; + size_t img_size = output.get_size() / out_shape[0]; + + ov::Tensor image(output.get_element_type(), img_shape); + uint8_t* out_data = output.data(); + uint8_t* img_data = image.data(); + + for (int img_num = 0; img_num < out_shape[0]; ++img_num) { + std::memcpy(img_data, out_data + img_size * img_num, img_size * sizeof(uint8_t)); + + char img_name[25]; + sprintf(img_name, "image_%d.bmp", img_num); + + imwrite(img_name, image, true); + } + } + +} //namespace + +int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::Text2ImagePipeline pipe(models_path, device); + ov::Tensor image = pipe.generate(prompt, + ov::genai::width(512), + ov::genai::height(512), + ov::genai::num_inference_steps(20), + ov::genai::num_images_per_prompt(1)); + + imwrite_output_imgs(image); + + return EXIT_SUCCESS; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md index 49d4545850..b9d0ebcfe4 100644 --- a/samples/cpp/visual_language_chat/README.md +++ b/samples/cpp/visual_language_chat/README.md @@ -15,7 +15,7 @@ export_MiniCPM-V-2_6.py miniCPM-V-2_6 ## Run -https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 can be used as a sample image. +[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. 
`visual_language_chat miniCPM-V-2_6 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg` diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index 838a1be78e..b9af689fce 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -26,12 +26,10 @@ int main(int argc, char* argv[]) try { pipe.start_chat(); std::cout << "question:\n"; - if (!std::getline(std::cin, prompt)) { - throw std::runtime_error("std::cin failed"); - } + std::getline(std::cin, prompt); pipe.generate( prompt, - ov::genai::image(std::move(image)), + ov::genai::image(image), ov::genai::streamer(print_subword) ); std::cout << "\n----------\n" diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md index ab5a76c70a..fec5d9194f 100644 --- a/samples/cpp/whisper_speech_recognition/README.md +++ b/samples/cpp/whisper_speech_recognition/README.md @@ -23,7 +23,7 @@ Prepare audio file in wav format with sampling rate 16k Hz. Output: text transcription of `sample.wav` -Models can be downloaded from [OpenAI HiggingFace](https://huggingface.co/openai). +Models can be downloaded from [OpenAI HuggingFace](https://huggingface.co/openai). Supported Models: [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) diff --git a/samples/python/vlm_chat_sample/README.md b/samples/python/vlm_chat_sample/README.md new file mode 100644 index 0000000000..246cbe3cd8 --- /dev/null +++ b/samples/python/vlm_chat_sample/README.md @@ -0,0 +1,38 @@ +# Python vlm_chat_sample that supports VLM models + +This example showcases inference of text-generation Vision Language Models (VLMs): `miniCPM-V-2_6` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.VLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/minicpm-v-multimodal-chatbot) which provides an example of Visual-language assistant. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +``` +# TODO: add optimum cli command for miniCPM-V-2_6 when available + +## Run: +[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. + +`vlm_chat_sample.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. # TODO: examples of larger models +Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. 
+ +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/vlm_chat_sample/vlm_chat_sample.py b/samples/python/vlm_chat_sample/vlm_chat_sample.py new file mode 100644 index 0000000000..686fae939f --- /dev/null +++ b/samples/python/vlm_chat_sample/vlm_chat_sample.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse + +import numpy as np +import openvino_genai +from PIL import Image +from openvino import Tensor + + +def streamer(subword: str) -> bool: + ''' + + Args: + subword: sub-word of the generated text. + + Returns: Return flag corresponds whether generation should be stopped. + + ''' + print(subword, end='', flush=True) + + # No value is returned as in this example we don't want to stop the generation in this method. + # "return None" will be treated the same as "return False". + + +def read_image(path: str) -> Tensor: + ''' + + Args: + path: The path to the image. + + Returns: the ov.Tensor containing the image. + + ''' + pic = Image.open(path) + image_data = np.array(pic.getdata()).reshape(1, 3, pic.size[1], pic.size[0]).astype(np.byte) + return Tensor(image_data) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model_dir') + parser.add_argument('image_dir') + args = parser.parse_args() + + image = read_image(args.image_dir) + + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.VLMPipeline(args.model_dir, device) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + while True: + try: + prompt = input('question:\n') + except EOFError: + break + pipe(prompt, image=image, generation_config=config, streamer=streamer) + print('\n----------') + pipe.finish_chat() + + +if '__main__' == __name__: + main() diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md index ab5a76c70a..fec5d9194f 100644 --- a/samples/python/whisper_speech_recognition/README.md +++ b/samples/python/whisper_speech_recognition/README.md @@ -23,7 +23,7 @@ Prepare audio file in wav format with sampling rate 16k Hz. Output: text transcription of `sample.wav` -Models can be downloaded from [OpenAI HiggingFace](https://huggingface.co/openai). +Models can be downloaded from [OpenAI HuggingFace](https://huggingface.co/openai). Supported Models: [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) diff --git a/samples/requirements.txt b/samples/requirements.txt index b8cc30895e..4821d6dbef 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -3,4 +3,4 @@ optimum[openvino]==1.22.0 einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diffusers==0.30.3 -torchvision +torchvision # needed for mini-CPM export script. 
Need to remove when we switch to exporting with optimum-intel. diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index da4540bb90..20b547052c 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -51,7 +51,9 @@ file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_ set(TARGET_NAME openvino_genai) add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) -add_dependencies(${TARGET_NAME} openvino_tokenizers) +if(TARGET openvino_tokenizers) + add_dependencies(${TARGET_NAME} openvino_tokenizers) +endif() add_library(openvino::genai ALIAS ${TARGET_NAME}) target_include_directories(${TARGET_NAME} diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 5d6bce880f..a1244d3d75 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -161,8 +161,6 @@ static constexpr ov::Property presence_penalty{"presence_penalty"}; static constexpr ov::Property frequency_penalty{"frequency_penalty"}; static constexpr ov::Property rng_seed{"rng_seed"}; -static constexpr AdaptersProperty adapters; - // Predefined Configs OPENVINO_GENAI_EXPORTS GenerationConfig beam_search(); OPENVINO_GENAI_EXPORTS GenerationConfig greedy(); diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index a14dd1dde0..b21fb43bdb 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -270,7 +270,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { }; OPENVINO_GENAI_EXPORTS std::pair streamer(StreamerVariant func); -std::pair generation_config(const GenerationConfig& config); +OPENVINO_GENAI_EXPORTS std::pair generation_config(const GenerationConfig& config); } // namespace genai } // namespace ov diff --git a/src/cpp/include/openvino/genai/lora_adapter.hpp b/src/cpp/include/openvino/genai/lora_adapter.hpp index 388ccdb941..5748abb807 100644 --- a/src/cpp/include/openvino/genai/lora_adapter.hpp +++ b/src/cpp/include/openvino/genai/lora_adapter.hpp @@ -92,7 +92,9 @@ struct OPENVINO_GENAI_EXPORTS AdapterConfig { class AdaptersProperty : public ov::Property { public: - constexpr AdaptersProperty() : ov::Property("adapters") {} + inline constexpr static const char* name () { return "adapters"; } + + constexpr AdaptersProperty() : ov::Property(name()) {} inline std::pair operator()(const AdapterConfig& config) const { return ov::Property::operator()(config); @@ -154,6 +156,9 @@ class AdaptersProperty : public ov::Property { }; +static constexpr AdaptersProperty adapters; + + class OPENVINO_GENAI_EXPORTS AdapterController { std::shared_ptr m_pimpl; @@ -165,15 +170,12 @@ class OPENVINO_GENAI_EXPORTS AdapterController { AdapterController(std::shared_ptr model, const AdapterConfig& config, const std::string& prefix, std::string device = ""); - // Call it every time when adapter config is changed; if adapter is configured as a static one, this call is not required - void apply(ov::InferRequest& request, const AdapterConfig& config); + // Apply adapters configured in the current config set last time, or set and use new config given as optional `config` argument + void apply(ov::InferRequest& request, const std::optional& config = std::nullopt); // the next call of apply will set all adapter tensors regardless of config change, use this method if full state.reset is called for the controlled model void force_full_apply(bool full_apply = true); - // Apply the 
same config that was used last time (in initialization or in previous call to apply). - void apply(ov::InferRequest& request); - operator bool() const { return bool(m_pimpl); } diff --git a/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp b/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp index 4f348156c2..1f79b039d7 100644 --- a/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp +++ b/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp @@ -7,6 +7,7 @@ #include "openvino/genai/visibility.hpp" #include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/lora_adapter.hpp" #include "openvino/core/any.hpp" #include "openvino/runtime/tensor.hpp" @@ -21,6 +22,7 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel { struct Config { size_t max_position_embeddings = 77; size_t hidden_size = 512; + size_t num_hidden_layers = 13; explicit Config(const std::string& config_path); }; @@ -53,10 +55,15 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel { return compile(device, ov::AnyMap{std::forward(properties)...}); } + void set_adapters(const AdapterConfig& adapters); + ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance); + ov::Tensor get_output_tensor(const size_t idx); + private: Config m_config; + AdapterController m_adapter_controller; ov::InferRequest m_request; std::shared_ptr m_model; diff --git a/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp new file mode 100644 index 0000000000..e46e76f316 --- /dev/null +++ b/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "openvino/genai/visibility.hpp" +#include "openvino/genai/tokenizer.hpp" + +#include "openvino/core/any.hpp" +#include "openvino/runtime/tensor.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/properties.hpp" + +namespace ov { +namespace genai { + +class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection { +public: + struct Config { + size_t max_position_embeddings = 77; + size_t hidden_size = 512; + size_t num_hidden_layers = 33; + + explicit Config(const std::string& config_path); + }; + + explicit CLIPTextModelWithProjection(const std::string root_dir); + + CLIPTextModelWithProjection(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties = {}); + + template ::value, bool>::type = true> + CLIPTextModelWithProjection(const std::string& root_dir, + const std::string& device, + Properties&&... properties) + : CLIPTextModelWithProjection(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } + + CLIPTextModelWithProjection(const CLIPTextModelWithProjection&); + + const Config& get_config() const; + + CLIPTextModelWithProjection& reshape(int batch_size); + + CLIPTextModelWithProjection& compile(const std::string& device, const ov::AnyMap& properties = {}); + + template + ov::util::EnableIfAllStringAny compile( + const std::string& device, + Properties&&... 
properties) { + return compile(device, ov::AnyMap{std::forward(properties)...}); + } + + ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance); + + ov::Tensor get_output_tensor(const size_t idx); + +private: + Config m_config; + ov::InferRequest m_request; + std::shared_ptr m_model; + + Tokenizer m_clip_tokenizer; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/text2image/pipeline.hpp b/src/cpp/include/openvino/genai/text2image/pipeline.hpp index 952e775f2d..5ce6a08b11 100644 --- a/src/cpp/include/openvino/genai/text2image/pipeline.hpp +++ b/src/cpp/include/openvino/genai/text2image/pipeline.hpp @@ -13,7 +13,9 @@ #include "openvino/genai/visibility.hpp" +#include "openvino/genai/lora_adapter.hpp" #include "openvino/genai/text2image/clip_text_model.hpp" +#include "openvino/genai/text2image/clip_text_model_with_projection.hpp" #include "openvino/genai/text2image/unet2d_condition_model.hpp" #include "openvino/genai/text2image/autoencoder_kl.hpp" @@ -53,7 +55,8 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { AUTO, LCM, LMS_DISCRETE, - DDIM + DDIM, + EULER_DISCRETE }; static std::shared_ptr from_config(const std::string& scheduler_config_path, @@ -81,6 +84,8 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { int64_t width = -1; size_t num_inference_steps = 50; + AdapterConfig adapters; + void update_generation_config(const ov::AnyMap& config_map); // checks whether is config is valid @@ -96,6 +101,13 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { Text2ImagePipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties = {}); + template ::value, bool>::type = true> + Text2ImagePipeline(const std::string& root_dir, + const std::string& device, + Properties&&... 
properties) + : Text2ImagePipeline(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } + // creates either LCM or SD pipeline from building blocks static Text2ImagePipeline stable_diffusion( const std::shared_ptr& scheduler_type, @@ -110,6 +122,14 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { const UNet2DConditionModel& unet, const AutoencoderKL& vae_decoder); + // creates SDXL pipeline from building blocks + static Text2ImagePipeline stable_diffusion_xl( + const std::shared_ptr& scheduler_type, + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder); + GenerationConfig get_generation_config() const; void set_generation_config(const GenerationConfig& generation_config); @@ -138,6 +158,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { explicit Text2ImagePipeline(const std::shared_ptr& impl); class StableDiffusionPipeline; + class StableDiffusionXLPipeline; }; // diff --git a/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp b/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp index b5b5288049..b3cfe1d364 100644 --- a/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp +++ b/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp @@ -14,6 +14,7 @@ #include "openvino/runtime/tensor.hpp" #include "openvino/runtime/infer_request.hpp" #include "openvino/runtime/properties.hpp" +#include "openvino/genai/lora_adapter.hpp" namespace ov { namespace genai { @@ -61,10 +62,13 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel { void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states); + void set_adapters(const AdapterConfig& adapters); + ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep); private: Config m_config; + AdapterController m_adapter_controller; std::shared_ptr m_model; ov::InferRequest m_request; size_t m_vae_scale_factor; diff --git a/src/cpp/include/openvino/genai/vision_encoder.hpp b/src/cpp/include/openvino/genai/vision_encoder.hpp index 7370b7f8aa..474216736c 100644 --- a/src/cpp/include/openvino/genai/vision_encoder.hpp +++ b/src/cpp/include/openvino/genai/vision_encoder.hpp @@ -8,7 +8,7 @@ namespace ov::genai { /// @brief A pair describing image size. -struct HeightWidth { +struct ImageSize { /// @brief Height of a corresponding image. size_t height; /// @brief Width of a corresponding image. @@ -25,16 +25,16 @@ struct EncodedImage { ov::Tensor resized_source; /// @brief A size of an image used to compute embeddings for /// divided by ProcessorConfig's patch_size. - HeightWidth resized_source_size; + ImageSize resized_source_size; /// @brief Embeddings of images obtained from a source image by /// slicing at no more than max_slice_nums pieces and resizing. /// The tensor's shape is /// [slice_y, slice_x, number_of_embeddings, embedding_size]. /// slices_sizes.size() == slice_y * slice_x. ov::Tensor slices; - /// @brief Flattened sizes of images used to compute embeddings + /// @brief A size of images used to compute embeddings /// stored in slices member divided by ProcessorConfig's patch_size. 
- std::vector slices_sizes; + ImageSize slices_size; }; /// @brief A class used to infer embeddings of an image using diff --git a/src/cpp/include/openvino/genai/vlm_pipeline.hpp b/src/cpp/include/openvino/genai/vlm_pipeline.hpp index 85ea9dd661..38595f1b96 100644 --- a/src/cpp/include/openvino/genai/vlm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/vlm_pipeline.hpp @@ -65,37 +65,14 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { explicit VLMPipeline( const std::filesystem::path& model_dir, const std::string& device="CPU", - const ov::AnyMap device_config={}, - ov::Core core=ov::Core{} - ) : VLMPipeline{ - model_dir, - Tokenizer(model_dir.string(), device_config), - device, - device_config, - core - } {} - - /// @brief Construct a pipeline form a folder containing model IRs - /// and from a Tokenizer instance. - /// @param model_dir A folder to read model IRs. - /// @param tokenizer An instance of Tokenizer to use. - /// @param device Inference device. - /// @param device_config A config to pass to ov::Core.set_property() - /// and ov::Core::compile_model(). - /// @param core ov::Core instance to use. - VLMPipeline( - const std::filesystem::path& model_dir, - const ov::genai::Tokenizer& tokenizer, - const std::string& device="CPU", - const ov::AnyMap device_config={}, - ov::Core core=ov::Core{} + const ov::AnyMap device_config={} ); /// @brief Default destructor. ~VLMPipeline(); /// @brief Generate a response given a prompt and any number of - /// uint8 RGB images. + /// uint8 RGB images with [NCHW] or [CHW] layout. /// @param prompt A prompt to respond to. /// @param images Images to be prepended to a prompt. /// @param generation_config A config to follow for text generation. @@ -120,7 +97,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// @brief Generate a response given a prompt and arbitrary number /// of ov::Property instances. /// Example: - /// generate("text", image(std::move(rgb)), do_sample(true)); + /// generate("text", image(rgb), do_sample(true)); /// @param prompt A prompt to respond to. /// @param ...properties ov::Property instances to be combined into /// ov::AnyMap. @@ -166,7 +143,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /* * utils that allow to use generate() in the following way: - * pipe.generate(prompt, ov::genai::image(std::move(image_tensor))). + * pipe.generate(prompt, ov::genai::image(image_tensor)). 
*/ static constexpr ov::Property image{"image"}; static constexpr ov::Property> images{"images"}; diff --git a/src/cpp/src/clip.cpp b/src/cpp/src/clip.cpp index bb416519bb..93adc26eb2 100644 --- a/src/cpp/src/clip.cpp +++ b/src/cpp/src/clip.cpp @@ -6,9 +6,6 @@ // I'll gradually clean and extend it // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch -#define STB_IMAGE_IMPLEMENTATION -#include "stb_image.hpp" - #include #include #include diff --git a/src/cpp/src/clip.hpp b/src/cpp/src/clip.hpp index 99c06a05d2..bce6cc8970 100644 --- a/src/cpp/src/clip.hpp +++ b/src/cpp/src/clip.hpp @@ -1,8 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#ifndef CLIP_H -#define CLIP_H +#pragma once #include #include @@ -53,4 +52,3 @@ bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_wid /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ clip_image_f32 clip_image_preprocess(struct clip_ctx& ctx, const clip_image_u8& img); -#endif // CLIP_H diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 94a05dd587..ff7ceb051e 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -16,6 +16,7 @@ #include "utils.hpp" #include "text_callback_streamer.hpp" #include "openvino/genai/lora_adapter.hpp" +#include "lora_helper.hpp" namespace ov { namespace genai { @@ -76,12 +77,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(model_path)) { ov::Core core; - auto adapters_iter = plugin_config.find(ov::genai::adapters.name()); - if (adapters_iter != plugin_config.end()) { - m_generation_config.adapters = adapters_iter->second.as(); - auto filtered_plugin_config = plugin_config; - filtered_plugin_config.erase(ov::genai::adapters.name()); - auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(filtered_plugin_config); + if(auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { + auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(*filtered_plugin_config); core.set_property(core_plugin_config); auto model = core.read_model(model_path / "openvino_model.xml"); m_adapter_controller = AdapterController(model, m_generation_config.adapters, "base_model.model.model.", device); // TODO: Make the prefix name configurable diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 0c75ad30b4..e330693c5d 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -52,6 +52,36 @@ void align_u4_zp_constants(const std::shared_ptr& model) { } } +bool allow_to_enable_npuw_dq(const std::shared_ptr& model) { + std::vector rt_info_path = {"nncf", "weight_compression", "group_size"}; + if (!model->has_rt_info(rt_info_path)) { + // NB: Model isn't compressed by NNCF - skip + return false; + } + auto group_size = model->get_rt_info(rt_info_path); + if (group_size == -1) { + // NB: Enable DQ for CW quantized models + return true; + } + return false; +} + +std::optional pop_option(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + config.erase(it); + return std::make_optional(it->second); + 
} + return std::nullopt; +} + +void enable_npuw_dq_if_allowed(ov::AnyMap& config, + const std::shared_ptr& model) { + if (allow_to_enable_npuw_dq(model)) { + config["NPUW_DQ"] = "YES"; + pop_option(config, "NPUW_ONLINE_AVOID"); + } +} + std::shared_ptr redirect_new_kv_to_output(const std::shared_ptr& model) { const auto kStartOutputKVCacheLayers = 1u; for (int i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) { @@ -182,19 +212,22 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) { } } -ov::AnyMap get_default_prefill_config() { - std::map config = { +ov::AnyMap get_default_prefill_config(const std::shared_ptr& model) { + ov::AnyMap config = { { "NPU_USE_NPUW", "YES" }, { "NPUW_FOLD", "YES" }, { "NPUW_DCOFF_TYPE", "f16" }, { "NPUW_DCOFF_SCALE", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add" }, { "NPUW_ONLINE_AVOID", "P:RMSNorm/NPU" } }; - return { config.begin(), config.end() }; + enable_npuw_dq_if_allowed(config, model); + return config; } -ov::AnyMap get_default_generate_config() { - std::map config = { +ov::AnyMap get_default_generate_config(const std::shared_ptr& model) { + ov::AnyMap config = { { "NPU_USE_NPUW", "YES" }, { "NPUW_FOLD", "YES" }, { "NPUW_DCOFF_TYPE", "f16" }, @@ -202,17 +235,18 @@ ov::AnyMap get_default_generate_config() { { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add" }, { "NPUW_PARALLEL_COMPILE", "YES" }, { "NPUW_FUNCALL_ASYNC", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, { "NPUW_ONLINE_AVOID", "P:RMSNorm/NPU" } }; - return { config.begin(), config.end() }; + enable_npuw_dq_if_allowed(config, model); + return config; } template T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) { - if (auto it = config.find(key); it != config.end()) { - auto value = it->second; - config.erase(it); - return value.as(); + auto anyopt = pop_option(config, key); + if (anyopt.has_value()) { + return anyopt.value().as(); } return default_value; } @@ -227,9 +261,7 @@ ov::Tensor make_tensor_slice(ov::Tensor tensor, size_t dim, size_t start_pos, si void drop_cache_dir(ov::AnyMap& config) { if (config.count("NPU_USE_NPUW") != 0u) { - if (auto it = config.find("CACHE_DIR"); it != config.end()) { - config.erase(it); - } + pop_option(config, "CACHE_DIR"); } } @@ -312,13 +344,18 @@ void StaticLLMPipeline::setupAndCompileModels( reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); // (8) Compile both model - auto prefill_config = pop_or_default(pipeline_config, "PREFILL_CONFIG", get_default_prefill_config()); - auto generate_config = pop_or_default(pipeline_config, "GENERATE_CONFIG", get_default_generate_config()); + auto prefill_config = pop_or_default( + pipeline_config, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model) + ); + auto generate_config = pop_or_default( + pipeline_config, "GENERATE_CONFIG", get_default_generate_config(m_kvcache_model) + ); merge_config_with(prefill_config, pipeline_config); merge_config_with(generate_config, pipeline_config); // FIXME: Drop CACHE_DIR option if NPUW is enabled drop_cache_dir(prefill_config); drop_cache_dir(generate_config); + m_prefill_request = core.compile_model( m_prefill_model, device, prefill_config ).create_infer_request(); @@ -342,7 +379,7 @@ void StaticLLMPipeline::setupAndImportModels( */ ov::Core 
core; - auto import_blob = [this, + auto import_blob = [this, &path, &pipeline_config, &core, @@ -397,8 +434,8 @@ void StaticLLMPipeline::setupAndImportModels( // (4) Fill in m_kvcache_desc const uint32_t kMaxPromptLen = get_kvcache_size(prefill_model); const uint32_t kMinResponseLen = get_kvcache_size(generate_model) - kMaxPromptLen; - // FIXME For some models KV-cache dim != 2u - m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, 2u }; + // FIXME For some models KV-cache dim != 2u + m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, 2u }; } void StaticLLMPipeline::start_chat(const std::string& system_message) { diff --git a/src/cpp/src/lora_adapter.cpp b/src/cpp/src/lora_adapter.cpp index 216fca98a4..2bfd1d5ca1 100644 --- a/src/cpp/src/lora_adapter.cpp +++ b/src/cpp/src/lora_adapter.cpp @@ -491,7 +491,7 @@ class LoRATransformBase : public ov::pass::MatcherPass { // Builds LoRA subgraph that consists of several matrix and element-wise multiplications with optional data type conversions and reshapes // to build a consistent graph. -NodePtr tensors_multiplication(NodePtr input, const NodeVector multipliers, ov::Output target, bool transpose_weights, size_t alpha_pos) { +NodePtr tensors_multiplication(NodePtr input, const NodeVector multipliers, ov::Output target, bool transpose_weights, size_t alpha_pos, bool transpose_in_end) { const auto target_type = target.get_element_type(); const auto target_shape = target.get_partial_shape(); const auto target_rank = target_shape.rank().get_length(); @@ -516,7 +516,7 @@ NodePtr tensors_multiplication(NodePtr input, const NodeVector multipliers, ov:: } } - if(target_rank == 4 && target_shape[-1].is_static() && target_shape[-1].get_length() > 1) { // FIXME: Check all potentially permuted dimensions, not only the last one + if(transpose_in_end) { // FIXME: Check the dimensions we really need to move, currently it is hardcoded 2 + 2 dimensions that usually appears in 2D Convolution case // where we need to apply LoRA for the first two dimensions (channels) while interpreting two last dimensions (spatial ) // TODO: Stash transposition constant to reuse @@ -648,7 +648,7 @@ class LoRAFuseTransform : public LoRATransformBase { for(auto multiplier : adapter) { parameters.push_back(std::make_shared(multiplier->get_output_element_type(0), multiplier->get_output_partial_shape(0))); } - auto result = std::make_shared(tensors_multiplication(nullptr, NodeVector{parameters.begin() + 1, parameters.end()}, target, false, 1)); + auto result = std::make_shared(tensors_multiplication(nullptr, NodeVector{parameters.begin() + 1, parameters.end()}, target, false, 1, false)); auto weights_model = std::make_shared(ov::ResultVector{result}, parameters); fusers.insert(signature, weights_model); } @@ -699,6 +699,7 @@ class LoRASeparateTransform : public LoRATransformBase { auto target_rank = target.get_partial_shape().rank().get_length(); auto consumers = target.get_target_inputs(); + bool transpose_in_end = false; // FIXME: Should check rank of activations instead of target rank if(target_rank == 4 && target.get_partial_shape()[target_rank - 3].get_length() > 1) { @@ -707,10 +708,11 @@ class LoRASeparateTransform : public LoRATransformBase { auto transposition = v0::Constant::create(ov::element::i32, ov::Shape{4}, std::vector{2, 3, 0, 1}); auto transpose = register_new_node(activations, transposition); activations = transpose; + transpose_in_end = true; } NodeVector lora_variables{lora_weight.A, lora_weight.alpha, 
lora_weight.B}; - replacement = tensors_multiplication(activations.get_node_shared_ptr(), lora_variables, target, true, 1); + replacement = tensors_multiplication(activations.get_node_shared_ptr(), lora_variables, target, true, 1, transpose_in_end); for (auto consumer : consumers) { consumer.replace_source_output(replacement->output(0)); @@ -843,7 +845,7 @@ struct AdapterControllerImpl { } struct ConfigChanged { - bool mode; + bool mode = false; bool alpha = false; bool adapter = false; @@ -872,25 +874,28 @@ struct AdapterControllerImpl { return diff; } - void apply (ov::InferRequest& infer_request, const AdapterConfig& config) { + void apply (ov::InferRequest& infer_request, std::optional config) { // FIXME: If a part of LoRA state tensors are not set here, then need to carefully reset state in LLMPipeline where global reset is called after the generation - - const auto diff = compare_configs(current_config, config); - OPENVINO_ASSERT( - !diff.mode || config.get_mode() == AdapterConfig::MODE_AUTO, // MODE_AUTO in this call means that mode is not changed - "AdapterConfig::mode cannot be changed and should be configured once for a model at the initialization"); - OPENVINO_ASSERT( - config.get_mode() == AdapterConfig::MODE_AUTO || config.get_mode() == AdapterConfig::MODE_DYNAMIC || config.get_mode() == AdapterConfig::MODE_STATIC_RANK || (!diff.alpha && !diff.adapter), - "Cannot change adapters and/or the alphas when not one of the dynamic modes are used."); + ConfigChanged diff; + if(config) { + diff = compare_configs(current_config, *config); + OPENVINO_ASSERT( + !diff.mode || config->get_mode() == AdapterConfig::MODE_AUTO, // MODE_AUTO in this call means that mode is not changed + "AdapterConfig::mode cannot be changed and should be configured once for a model at the initialization"); + OPENVINO_ASSERT( + config->get_mode() == AdapterConfig::MODE_AUTO || config->get_mode() == AdapterConfig::MODE_DYNAMIC || config->get_mode() == AdapterConfig::MODE_STATIC_RANK || (!diff.alpha && !diff.adapter), + "Cannot change adapters and/or the alphas when not one of the dynamic modes are used."); + current_config = *config; + } if(need_full_apply) { need_full_apply = false; - set_new_adapter_tensors(infer_request, config); + set_new_adapter_tensors(infer_request); } else if(diff) { if(diff.adapter) { - set_new_adapter_tensors(infer_request, config); + set_new_adapter_tensors(infer_request); } else { OPENVINO_ASSERT(diff.alpha); - set_new_adapter_alphas(infer_request, config); + set_new_adapter_alphas(infer_request); } } } @@ -899,13 +904,12 @@ struct AdapterControllerImpl { need_full_apply = full_apply; } - void set_new_adapter_alphas (ov::InferRequest& infer_request, const AdapterConfig& config) { + void set_new_adapter_alphas (ov::InferRequest& infer_request) { // FIXME: Provide more economical way to update only alphas - set_new_adapter_tensors(infer_request, config); + set_new_adapter_tensors(infer_request); } - void set_new_adapter_tensors (ov::InferRequest& infer_request, const AdapterConfig& config) { - current_config = config; // FIXME: Keep the old config to map to cached LoRA state tensors instead of the current approach where we start from scratch each time + void set_new_adapter_tensors (ov::InferRequest& infer_request) { if(current_config.get_mode() != AdapterConfig::MODE_AUTO && current_config.get_mode() != AdapterConfig::MODE_DYNAMIC && current_config.get_mode() != AdapterConfig::MODE_STATIC_RANK ) { return; } @@ -1163,10 +1167,6 @@ struct AdapterControllerImpl { } return new_tensors; } - 
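For illustration only (not part of the diff): the hunks above and below consolidate the two `AdapterControllerImpl::apply` overloads into one that takes an optional config, with the last applied configuration cached in `current_config`. A minimal calling sketch under that assumption; `reapply_lora`, `controller`, `request`, and `maybe_config` are hypothetical names introduced here:

```cpp
#include <optional>

#include "openvino/genai/lora_adapter.hpp"
#include "openvino/runtime/infer_request.hpp"

// Sketch of the new calling contract of AdapterController::apply after this patch.
void reapply_lora(ov::genai::AdapterController& controller,
                  ov::InferRequest& request,
                  const std::optional<ov::genai::AdapterConfig>& maybe_config) {
    // With a concrete config, mode/alpha changes are validated and the LoRA state tensors are rewritten.
    // With std::nullopt, the controller reuses its stored current_config and only refreshes tensors
    // if a full re-apply was requested beforehand.
    controller.apply(request, maybe_config);
}
```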
-    void apply (ov::InferRequest& infer_request) {
-        return apply(infer_request, current_config);
-    }
 };
@@ -1207,13 +1207,13 @@ AdapterController::AdapterController(std::shared_ptr<ov::Model> model, const Ada
 // Call it every time when adapter config is changed; if adapter was configured as a static one, this call is not required
-void AdapterController::apply(ov::InferRequest& request, const AdapterConfig& config) {
-    return m_pimpl->apply(request, config);
-}
-
-
-void AdapterController::apply(ov::InferRequest& request){
-    return m_pimpl->apply(request);
+void AdapterController::apply(ov::InferRequest& request, const std::optional<AdapterConfig>& config) {
+    OPENVINO_ASSERT(m_pimpl || !config || !*config,
+        "Adapters are passed to AdapterController but it was not configured to use adapters. "
+        "Enable adapters by passing them to the constructor first.");
+    if (m_pimpl) {
+        m_pimpl->apply(request, config);
+    }
 }
diff --git a/src/cpp/src/lora_helper.cpp b/src/cpp/src/lora_helper.cpp
new file mode 100644
index 0000000000..7e7a6e613c
--- /dev/null
+++ b/src/cpp/src/lora_helper.cpp
@@ -0,0 +1,28 @@
+#include "lora_helper.hpp"
+
+
+namespace ov {
+namespace genai {
+
+std::optional<AnyMap> extract_adapters_from_properties (const AnyMap& properties, AdapterConfig* adapter_config) {
+    auto adapters_iter = properties.find(AdaptersProperty::name());
+    if (adapters_iter != properties.end()) {
+        if(adapter_config) {
+            *adapter_config = adapters_iter->second.as<AdapterConfig>();
+        }
+        auto filtered_properties = properties;
+        filtered_properties.erase(AdaptersProperty::name());
+        return filtered_properties;
+    }
+    return std::nullopt;
+}
+
+void update_adapters_from_properties (const AnyMap& properties, AdapterConfig& adapter_config) {
+    auto adapters_iter = properties.find(AdaptersProperty::name());
+    if (adapters_iter != properties.end()) {
+        adapter_config = adapters_iter->second.as<AdapterConfig>();
+    }
+}
+
+}
+}
\ No newline at end of file
diff --git a/src/cpp/src/lora_helper.hpp b/src/cpp/src/lora_helper.hpp
new file mode 100644
index 0000000000..b9e41e8b4c
--- /dev/null
+++ b/src/cpp/src/lora_helper.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <optional>
+
+#include "openvino/genai/lora_adapter.hpp"
+
+
+namespace ov {
+namespace genai {
+
+// Search for `adapters` property in `properties` map. If it is found and `adapter_config` is not nullptr,
+// set `adapter_config` with found value, and return a copy of `properties` with the `adapters` property removed.
+// If there is no `adapters` property, `adapter_config` is left unchanged and std::nullopt is returned.
+std::optional<AnyMap> extract_adapters_from_properties (const AnyMap& properties, AdapterConfig* adapter_config = nullptr);
+
+// Search for `adapters` property in `properties` map. If it is found, set `adapter_config` with found value.
+// If `adapters` property is not found, do nothing.
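Usage note for the `lora_helper` functions declared above and in the lines that follow (illustration only, not part of the diff): the Text2Image model wrappers later in this patch call `extract_adapters_from_properties` to peel the GenAI-only `adapters` entry off the property map before handing it to the plugin. A short sketch under that assumption; `compile_without_adapters` and its arguments are hypothetical:

```cpp
#include <optional>

#include "lora_helper.hpp"              // the helper introduced above
#include "openvino/runtime/core.hpp"

// Sketch: strip the `adapters` property before compilation, mirroring the
// AutoencoderKL / CLIPTextModel / UNet2DConditionModel constructors below.
ov::CompiledModel compile_without_adapters(ov::Core& core,
                                           const std::shared_ptr<ov::Model>& model,
                                           const std::string& device,
                                           const ov::AnyMap& properties) {
    ov::genai::AdapterConfig adapters;
    if (auto filtered = ov::genai::extract_adapters_from_properties(properties, &adapters)) {
        // `adapters` now holds the user-supplied config; the plugin never sees the GenAI-only key.
        return core.compile_model(model, device, *filtered);
    }
    return core.compile_model(model, device, properties);
}
```

Returning `std::optional<AnyMap>` keeps the no-adapters path free of an extra property-map copy: callers fall back to the original map when `std::nullopt` comes back.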
+void update_adapters_from_properties (const AnyMap& properties, AdapterConfig& adapter_config); + +} +} \ No newline at end of file diff --git a/src/cpp/src/text2image/diffusion_pipeline.hpp b/src/cpp/src/text2image/diffusion_pipeline.hpp index 41dce0e030..1884df4ca6 100644 --- a/src/cpp/src/text2image/diffusion_pipeline.hpp +++ b/src/cpp/src/text2image/diffusion_pipeline.hpp @@ -1,6 +1,8 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include #include "text2image/schedulers/ischeduler.hpp" diff --git a/src/cpp/src/text2image/models/autoencoder_kl.cpp b/src/cpp/src/text2image/models/autoencoder_kl.cpp index e9aec9528f..30b378963b 100644 --- a/src/cpp/src/text2image/models/autoencoder_kl.cpp +++ b/src/cpp/src/text2image/models/autoencoder_kl.cpp @@ -14,6 +14,7 @@ #include "openvino/op/constant.hpp" #include "utils.hpp" +#include "lora_helper.hpp" namespace ov { namespace genai { @@ -43,7 +44,11 @@ AutoencoderKL::AutoencoderKL(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) : AutoencoderKL(root_dir) { - compile(device, properties); + if(auto filtered_properties = extract_adapters_from_properties(properties)) { + compile(device, *filtered_properties); + } else { + compile(device, properties); + } } AutoencoderKL::AutoencoderKL(const AutoencoderKL&) = default; diff --git a/src/cpp/src/text2image/models/clip_text_model.cpp b/src/cpp/src/text2image/models/clip_text_model.cpp index d42a07f9c1..b8ec871eb0 100644 --- a/src/cpp/src/text2image/models/clip_text_model.cpp +++ b/src/cpp/src/text2image/models/clip_text_model.cpp @@ -8,6 +8,7 @@ #include "openvino/runtime/core.hpp" #include "utils.hpp" +#include "lora_helper.hpp" namespace ov { namespace genai { @@ -21,6 +22,7 @@ CLIPTextModel::Config::Config(const std::string& config_path) { read_json_param(data, "max_position_embeddings", max_position_embeddings); read_json_param(data, "hidden_size", hidden_size); + read_json_param(data, "num_hidden_layers", num_hidden_layers); } CLIPTextModel::CLIPTextModel(const std::string root_dir) : @@ -33,7 +35,13 @@ CLIPTextModel::CLIPTextModel(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) : CLIPTextModel(root_dir) { - compile(device, properties); + AdapterConfig adapters; + if(auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) { + m_adapter_controller = AdapterController(m_model, adapters, "lora_te", device); + compile(device, *filtered_properties); + } else { + compile(device, properties); + } } CLIPTextModel::CLIPTextModel(const CLIPTextModel&) = default; @@ -64,6 +72,10 @@ CLIPTextModel& CLIPTextModel::compile(const std::string& device, const ov::AnyMa return *this; } +void CLIPTextModel::set_adapters(const AdapterConfig& adapters) { + m_adapter_controller.apply(m_request, adapters); +} + ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance) { OPENVINO_ASSERT(m_request, "CLIP text encoder model must be compiled first. 
Cannot infer non-compiled model"); @@ -100,5 +112,9 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string return m_request.get_output_tensor(0); } +ov::Tensor CLIPTextModel::get_output_tensor(const size_t idx) { + return m_request.get_output_tensor(idx); +} + } // namespace genai } // namespace ov diff --git a/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp b/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp new file mode 100644 index 0000000000..2fa7b83738 --- /dev/null +++ b/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp @@ -0,0 +1,109 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/clip_text_model_with_projection.hpp" + +#include + +#include "openvino/runtime/core.hpp" + +#include "utils.hpp" + +namespace ov { +namespace genai { + +CLIPTextModelWithProjection::Config::Config(const std::string& config_path) { + std::ifstream file(config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "max_position_embeddings", max_position_embeddings); + read_json_param(data, "hidden_size", hidden_size); + read_json_param(data, "num_hidden_layers", num_hidden_layers); +} + +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string root_dir) : + m_clip_tokenizer(root_dir + "/../tokenizer_2"), + m_config(root_dir + "/config.json") { + m_model = ov::Core().read_model(root_dir + "/openvino_model.xml"); +} + +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties) : + CLIPTextModelWithProjection(root_dir) { + compile(device, properties); +} + +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const CLIPTextModelWithProjection&) = default; + +const CLIPTextModelWithProjection::Config& CLIPTextModelWithProjection::get_config() const { + return m_config; +} + +CLIPTextModelWithProjection& CLIPTextModelWithProjection::reshape(int batch_size) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot reshape already compiled model"); + + ov::PartialShape input_shape = m_model->input(0).get_partial_shape(); + input_shape[0] = batch_size; + input_shape[1] = m_config.max_position_embeddings; + std::map idx_to_shape{{0, input_shape}}; + m_model->reshape(idx_to_shape); + + return *this; +} + +CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::string& device, const ov::AnyMap& properties) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model"); + ov::CompiledModel compiled_model = ov::Core().compile_model(m_model, device, properties); + m_request = compiled_model.create_infer_request(); + // release the original model + m_model.reset(); + + return *this; +} + +ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance) { + OPENVINO_ASSERT(m_request, "CLIP text encoder model must be compiled first. Cannot infer non-compiled model"); + + const int32_t pad_token_id = m_clip_tokenizer.get_pad_token_id(); + const size_t text_embedding_batch_size = do_classifier_free_guidance ? 
2 : 1; + + auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) { + std::fill_n(input_ids.data(), input_ids.get_size(), pad_token_id); + + ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids; + std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); + }; + + ov::Tensor input_ids(ov::element::i64, {text_embedding_batch_size, m_config.max_position_embeddings}); + size_t current_batch_idx = 0; + + if (do_classifier_free_guidance) { + perform_tokenization(neg_prompt, + ov::Tensor(input_ids, {current_batch_idx , 0}, + {current_batch_idx + 1, m_config.max_position_embeddings})); + ++current_batch_idx; + } else { + // Negative prompt is ignored when --guidanceScale < 1.0 + } + + perform_tokenization(pos_prompt, + ov::Tensor(input_ids, {current_batch_idx , 0}, + {current_batch_idx + 1, m_config.max_position_embeddings})); + + // text embeddings + m_request.set_tensor("input_ids", input_ids); + m_request.infer(); + + return m_request.get_output_tensor(0); +} + +ov::Tensor CLIPTextModelWithProjection::get_output_tensor(const size_t idx) { + return m_request.get_output_tensor(idx); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/models/unet2d_condition_model.cpp b/src/cpp/src/text2image/models/unet2d_condition_model.cpp index 69563fb7c4..d356515678 100644 --- a/src/cpp/src/text2image/models/unet2d_condition_model.cpp +++ b/src/cpp/src/text2image/models/unet2d_condition_model.cpp @@ -8,6 +8,7 @@ #include "openvino/runtime/core.hpp" #include "utils.hpp" +#include "lora_helper.hpp" namespace ov { namespace genai { @@ -36,7 +37,13 @@ UNet2DConditionModel::UNet2DConditionModel(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) : UNet2DConditionModel(root_dir) { - compile(device, properties); + AdapterConfig adapters; + if(auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) { + m_adapter_controller = AdapterController(m_model, adapters, "lora_unet", device); + compile(device, *filtered_properties); + } else { + compile(device, properties); + } } UNet2DConditionModel::UNet2DConditionModel(const UNet2DConditionModel&) = default; @@ -64,7 +71,7 @@ UNet2DConditionModel& UNet2DConditionModel::reshape(int batch_size, int height, name_to_shape[input_name][0] = 1; } else if (input_name == "sample") { name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width}; - } else if (input_name == "time_ids") { + } else if (input_name == "time_ids" || input_name == "text_embeds") { name_to_shape[input_name][0] = batch_size; } else if (input_name == "encoder_hidden_states") { name_to_shape[input_name][0] = batch_size; @@ -92,6 +99,10 @@ void UNet2DConditionModel::set_hidden_states(const std::string& tensor_name, ov: m_request.set_tensor(tensor_name, encoder_hidden_states); } +void UNet2DConditionModel::set_adapters(const AdapterConfig& adapters) { + m_adapter_controller.apply(m_request, adapters); +} + ov::Tensor UNet2DConditionModel::infer(ov::Tensor sample, ov::Tensor timestep) { OPENVINO_ASSERT(m_request, "UNet model must be compiled first. 
Cannot infer non-compiled model"); diff --git a/src/cpp/src/text2image/numpy_utils.cpp b/src/cpp/src/text2image/numpy_utils.cpp new file mode 100644 index 0000000000..9554681820 --- /dev/null +++ b/src/cpp/src/text2image/numpy_utils.cpp @@ -0,0 +1,79 @@ +#include "text2image/numpy_utils.hpp" +#include "openvino/core/except.hpp" + +namespace ov { +namespace genai { +namespace numpy_utils { + +void rescale_zero_terminal_snr(std::vector& betas) { + // Convert betas to alphas_bar_sqrt + std::vector alphas, alphas_bar_sqrt; + for (float b : betas) { + alphas.push_back(1.0f - b); + } + + for (size_t i = 1; i <= alphas.size(); ++i) { + float alpha_cumprod = + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies{}); + alphas_bar_sqrt.push_back(std::sqrt(alpha_cumprod)); + } + + float alphas_bar_sqrt_0 = alphas_bar_sqrt[0]; + float alphas_bar_sqrt_T = alphas_bar_sqrt[alphas_bar_sqrt.size() - 1]; + + for (float& x : alphas_bar_sqrt) { + // Shift so the last timestep is zero. + x = x - alphas_bar_sqrt_T; + // Scale so the first timestep is back to the old value. + x *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T); + // Revert sqrt + x = std::pow(x, 2); + } + + // Revert cumprod + std::vector end = alphas_bar_sqrt, begin = alphas_bar_sqrt; + end.erase(end.begin()); + begin.pop_back(); + + alphas[0] = alphas_bar_sqrt[0]; + for (size_t i = 1; i < alphas.size(); ++i) { + alphas[i] = end[i - 1] / begin[i - 1]; + } + + std::transform(alphas.begin(), alphas.end(), betas.begin(), [](float x) { + return (1 - x); + }); +} + +std::vector interp(const std::vector& x, const std::vector& xp, const std::vector& fp) { + OPENVINO_ASSERT(xp.size() == fp.size(), "`xp` and `fp`vectors must have the same sizes"); + + std::vector interp_res; + + for (const auto& i : x) { + if (i <= xp[0]) { + interp_res.push_back(fp[0]); + } else if (i >= xp[xp.size() - 1]) { + interp_res.push_back(fp[fp.size() - 1]); + } else { + // Find the first xp element that is not less than x[i] + auto it = std::lower_bound(xp.begin(), xp.end(), i); + + // idx of the left boundary + size_t idx = std::distance(xp.begin(), it) - 1; + + float x0 = xp[idx], x1 = xp[idx + 1]; + float y0 = fp[idx], y1 = fp[idx + 1]; + + float interp_val = (y1 - y0) / (x1 - x0) * (i - x0) + y0; + + interp_res.push_back(interp_val); + } + } + + return interp_res; +} + +} // namespace ov +} // namespace genai +} // namespace numpy_utils diff --git a/src/cpp/src/text2image/numpy_utils.hpp b/src/cpp/src/text2image/numpy_utils.hpp index 4520d35ae8..d6144eeb99 100644 --- a/src/cpp/src/text2image/numpy_utils.hpp +++ b/src/cpp/src/text2image/numpy_utils.hpp @@ -4,6 +4,11 @@ #pragma once #include +#include +#include +#include +#include +#include namespace ov { namespace genai { @@ -31,6 +36,12 @@ std::vector linspace(U start, U end, size_t num, bool endpoint = false) { return indices; } -}// namespace ov -}// namespace genai -}// namespace txt2img_utils +// Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) +void rescale_zero_terminal_snr(std::vector& betas); + +// np.interp(...) 
implementation +std::vector interp(const std::vector& x, const std::vector& xp, const std::vector& fp); + +} // namespace ov +} // namespace genai +} // namespace numpy_utils diff --git a/src/cpp/src/text2image/schedulers/ddim.cpp b/src/cpp/src/text2image/schedulers/ddim.cpp index 470a8c8ec0..a25cf7227e 100644 --- a/src/cpp/src/text2image/schedulers/ddim.cpp +++ b/src/cpp/src/text2image/schedulers/ddim.cpp @@ -62,6 +62,7 @@ DDIMScheduler::DDIMScheduler(const Config& scheduler_config) } if (m_config.rescale_betas_zero_snr) { + using numpy_utils::rescale_zero_terminal_snr; rescale_zero_terminal_snr(betas); } @@ -157,7 +158,7 @@ std::map DDIMScheduler::step(ov::Tensor noise_pred, ov: break; default: OPENVINO_THROW("Unsupported value for 'PredictionType'"); - } + } } // TODO: support m_config.thresholding @@ -197,45 +198,5 @@ void DDIMScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) return; } -void DDIMScheduler::rescale_zero_terminal_snr(std::vector& betas) { - // Convert betas to alphas_bar_sqrt - std::vector alphas, alphas_bar_sqrt; - for (float b : betas) { - alphas.push_back(1.0f - b); - } - - for (size_t i = 1; i <= alphas.size(); ++i) { - float alpha_cumprod = - std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies{}); - alphas_bar_sqrt.push_back(std::sqrt(alpha_cumprod)); - } - - float alphas_bar_sqrt_0 = alphas_bar_sqrt[0]; - float alphas_bar_sqrt_T = alphas_bar_sqrt[alphas_bar_sqrt.size() - 1]; - - for (float& x : alphas_bar_sqrt) { - // Shift so the last timestep is zero. - x = x - alphas_bar_sqrt_T; - // Scale so the first timestep is back to the old value. - x *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T); - // Revert sqrt - x = std::pow(x, 2); - } - - // Revert cumprod - std::vector end = alphas_bar_sqrt, begin = alphas_bar_sqrt; - end.erase(end.begin()); - begin.pop_back(); - - alphas[0] = alphas_bar_sqrt[0]; - for (size_t i = 1; i < alphas.size(); ++i) { - alphas[i] = end[i - 1] / begin[i - 1]; - } - - std::transform(alphas.begin(), alphas.end(), betas.begin(), [](float x) { - return (1 - x); - }); -} - } // namespace genai } // namespace ov diff --git a/src/cpp/src/text2image/schedulers/ddim.hpp b/src/cpp/src/text2image/schedulers/ddim.hpp index 062fc14ce5..936f4991ea 100644 --- a/src/cpp/src/text2image/schedulers/ddim.hpp +++ b/src/cpp/src/text2image/schedulers/ddim.hpp @@ -52,8 +52,6 @@ class DDIMScheduler : public IScheduler { size_t m_num_inference_steps; std::vector m_timesteps; - - void rescale_zero_terminal_snr(std::vector& betas); }; } // namespace genai diff --git a/src/cpp/src/text2image/schedulers/euler_discrete.cpp b/src/cpp/src/text2image/schedulers/euler_discrete.cpp new file mode 100644 index 0000000000..9873a3998f --- /dev/null +++ b/src/cpp/src/text2image/schedulers/euler_discrete.cpp @@ -0,0 +1,281 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text2image/schedulers/euler_discrete.hpp" + +#include +#include +#include +#include + +#include "text2image/numpy_utils.hpp" +#include "utils.hpp" + +namespace ov { +namespace genai { + +EulerDiscreteScheduler::Config::Config(const std::string& scheduler_config_path) { + std::ifstream file(scheduler_config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "num_train_timesteps", num_train_timesteps); + read_json_param(data, "beta_start", beta_start); + 
read_json_param(data, "beta_end", beta_end); + read_json_param(data, "beta_schedule", beta_schedule); + read_json_param(data, "trained_betas", trained_betas); + read_json_param(data, "final_sigmas_type", final_sigmas_type); + read_json_param(data, "interpolation_type", interpolation_type); + read_json_param(data, "sigma_max", sigma_max); + read_json_param(data, "sigma_min", sigma_min); + read_json_param(data, "steps_offset", steps_offset); + read_json_param(data, "prediction_type", prediction_type); + read_json_param(data, "timestep_spacing", timestep_spacing); + read_json_param(data, "timestep_type", timestep_type); + read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); + read_json_param(data, "use_karras_sigmas", use_karras_sigmas); + read_json_param(data, "use_exponential_sigmas", use_exponential_sigmas); + read_json_param(data, "use_beta_sigmas", use_beta_sigmas); +} + +EulerDiscreteScheduler::EulerDiscreteScheduler(const std::string scheduler_config_path) + : EulerDiscreteScheduler(Config(scheduler_config_path)) {} + +EulerDiscreteScheduler::EulerDiscreteScheduler(const Config& scheduler_config) : m_config(scheduler_config) { + std::vector alphas, betas; + + using numpy_utils::linspace; + + if (!m_config.trained_betas.empty()) { + betas = m_config.trained_betas; + } else if (m_config.beta_schedule == BetaSchedule::LINEAR) { + betas = linspace(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps); + } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) { + float start = std::sqrt(m_config.beta_start); + float end = std::sqrt(m_config.beta_end); + betas = linspace(start, end, m_config.num_train_timesteps); + std::for_each(betas.begin(), betas.end(), [](float& x) { + x *= x; + }); + } else { + OPENVINO_THROW( + "'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. Please, add support of other types"); + } + + if (m_config.rescale_betas_zero_snr) { + using numpy_utils::rescale_zero_terminal_snr; + rescale_zero_terminal_snr(betas); + } + + std::transform(betas.begin(), betas.end(), std::back_inserter(alphas), [](float b) { + return 1.0f - b; + }); + + for (size_t i = 1; i <= alphas.size(); ++i) { + float alpha_cumprod = + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies{}); + m_alphas_cumprod.push_back(alpha_cumprod); + } + + if (m_config.rescale_betas_zero_snr) { + m_alphas_cumprod.back() = std::pow(2, -24); + } + + for (auto it = m_alphas_cumprod.rbegin(); it != m_alphas_cumprod.rend(); ++it) { + float sigma = std::pow(((1 - (*it)) / (*it)), 0.5); + m_sigmas.push_back(sigma); + } + + auto linspaced = + linspace(0.0f, static_cast(m_config.num_train_timesteps - 1), m_config.num_train_timesteps, true); + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { + m_timesteps.push_back(static_cast(std::round(*it))); + } + + OPENVINO_ASSERT( + m_config.timestep_type != TimestepType::CONTINUOUS || m_config.prediction_type != PredictionType::V_PREDICTION, + "This case isn't supported: `timestep_type=continuous` and `prediction_type=v_prediction`. 
Please, add " + "support."); + + m_sigmas.push_back(0); +} + +void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps) { + // TODO: support `timesteps` and `sigmas` inputs + m_timesteps.clear(); + m_sigmas.clear(); + + m_num_inference_steps = num_inference_steps; + std::vector sigmas; + + OPENVINO_ASSERT( + m_config.timestep_type != TimestepType::CONTINUOUS || m_config.prediction_type != PredictionType::V_PREDICTION, + "This case isn't supported: `timestep_type=continuous` and `prediction_type=v_prediction`. Please, add " + "support."); + + switch (m_config.timestep_spacing) { + case TimestepSpacing::LINSPACE: { + using numpy_utils::linspace; + float end = static_cast(m_config.num_train_timesteps - 1); + auto linspaced = linspace(0.0f, end, num_inference_steps, true); + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { + m_timesteps.push_back(static_cast(std::round(*it))); + } + break; + } + case TimestepSpacing::LEADING: { + size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps; + for (size_t i = num_inference_steps - 1; i != -1; --i) { + m_timesteps.push_back(i * step_ratio + m_config.steps_offset); + } + break; + } + case TimestepSpacing::TRAILING: { + float step_ratio = static_cast(m_config.num_train_timesteps) / static_cast(m_num_inference_steps); + for (float i = m_config.num_train_timesteps; i > 0; i -= step_ratio) { + m_timesteps.push_back(static_cast(std::round(i)) - 1); + } + break; + } + default: + OPENVINO_THROW("Unsupported value for 'timestep_spacing'"); + } + + for (const float& i : m_alphas_cumprod) { + float sigma = std::pow(((1 - i) / i), 0.5); + sigmas.push_back(sigma); + } + + switch (m_config.interpolation_type) { + case InterpolationType::LINEAR: { + using numpy_utils::interp; + + std::vector x_data_points(sigmas.size()); + std::iota(x_data_points.begin(), x_data_points.end(), 0); + m_sigmas = interp(m_timesteps, x_data_points, sigmas); + break; + } + case InterpolationType::LOG_LINEAR: { + using numpy_utils::linspace; + + m_sigmas = linspace(std::log(sigmas.back()), std::log(sigmas[0]), num_inference_steps + 1, true); + std::transform(m_sigmas.begin(), m_sigmas.end(), m_sigmas.begin(), [](float x) { + return std::exp(x); + }); + break; + } + default: + OPENVINO_THROW("Unsupported value for 'interpolation_type'"); + } + + OPENVINO_ASSERT(!m_config.use_karras_sigmas, + "Parameter 'use_karras_sigmas' is not supported. Please, add support."); + + OPENVINO_ASSERT(!m_config.use_exponential_sigmas, + "Parameter 'use_exponential_sigmas' is not supported. Please, add support."); + + OPENVINO_ASSERT(!m_config.use_beta_sigmas, "Parameter 'use_beta_sigmas' is not supported. 
Please, add support."); + + float sigma_last = 0; + switch (m_config.final_sigmas_type) { + case FinalSigmaType::SIGMA_MIN: + sigma_last = std::pow(((1 - m_alphas_cumprod[0]) / m_alphas_cumprod[0]), 0.5); + break; + case FinalSigmaType::ZERO: + break; + default: + OPENVINO_THROW("Unsupported value for 'final_sigmas_type'"); + } + m_sigmas.push_back(sigma_last); +} + +std::map EulerDiscreteScheduler::step(ov::Tensor noise_pred, + ov::Tensor latents, + size_t inference_step) { + // noise_pred - model_output + // latents - sample + // inference_step + + size_t timestep = get_timesteps()[inference_step]; + + if (m_step_index == -1) + m_step_index = 0; + + float sigma = m_sigmas[m_step_index]; + // TODO: hardcoded gamma + float gamma = 0.0f; + float sigma_hat = sigma * (gamma + 1); + + float* model_output_data = noise_pred.data(); + float* sample_data = latents.data(); + + ov::Tensor pred_original_sample(noise_pred.get_element_type(), noise_pred.get_shape()); + float* pred_original_sample_data = pred_original_sample.data(); + + ov::Tensor prev_sample(noise_pred.get_element_type(), noise_pred.get_shape()); + float* prev_sample_data = prev_sample.data(); + + // 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + switch (m_config.prediction_type) { + case PredictionType::EPSILON: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = sample_data[i] - model_output_data[i] * sigma_hat; + } + break; + case PredictionType::SAMPLE: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = model_output_data[i]; + } + break; + case PredictionType::V_PREDICTION: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = model_output_data[i] * (-sigma / std::pow((std::pow(sigma, 2) + 1), 0.5)) + + (sample_data[i] / (std::pow(sigma, 2) + 1)); + } + break; + default: + OPENVINO_THROW("Unsupported value for 'PredictionType'"); + } + + float dt = m_sigmas[m_step_index + 1] - sigma_hat; + + // 2. 
Convert to an ODE derivative + for (size_t i = 0; i < prev_sample.get_size(); ++i) { + prev_sample_data[i] = ((sample_data[i] - pred_original_sample_data[i]) / sigma_hat) * dt + sample_data[i]; + } + + m_step_index += 1; + + return {{"latent", prev_sample}, {"denoised", pred_original_sample}}; +} + +std::vector EulerDiscreteScheduler::get_timesteps() const { + return m_timesteps; +} + +float EulerDiscreteScheduler::get_init_noise_sigma() const { + float max_sigma = *std::max_element(m_sigmas.begin(), m_sigmas.end()); + + if (m_config.timestep_spacing == TimestepSpacing::LINSPACE || + m_config.timestep_spacing == TimestepSpacing::TRAILING) { + return max_sigma; + } + + return std::sqrt(max_sigma * max_sigma + 1); +} + +void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { + if (m_step_index == -1) + m_step_index = 0; + + float sigma = m_sigmas[m_step_index]; + float* sample_data = sample.data(); + for (size_t i = 0; i < sample.get_size(); i++) { + sample_data[i] /= std::pow((std::pow(sigma, 2) + 1), 0.5); + } +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/euler_discrete.hpp b/src/cpp/src/text2image/schedulers/euler_discrete.hpp new file mode 100644 index 0000000000..1dc60f118f --- /dev/null +++ b/src/cpp/src/text2image/schedulers/euler_discrete.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include "text2image/schedulers/types.hpp" +#include "text2image/schedulers/ischeduler.hpp" + +namespace ov { +namespace genai { + +class EulerDiscreteScheduler : public IScheduler { +public: + struct Config { + int32_t num_train_timesteps = 1000; + float beta_start = 0.0001f, beta_end = 0.02f; + BetaSchedule beta_schedule = BetaSchedule::SCALED_LINEAR; + std::vector trained_betas = {}; + FinalSigmaType final_sigmas_type = FinalSigmaType::ZERO; + InterpolationType interpolation_type = InterpolationType::LINEAR; + float sigma_max = 0.0f, sigma_min = 0.0f; + size_t steps_offset = 0; + PredictionType prediction_type = PredictionType::EPSILON; + TimestepSpacing timestep_spacing = TimestepSpacing::LEADING; + TimestepType timestep_type = TimestepType::DISCRETE; + bool rescale_betas_zero_snr = false; + bool use_karras_sigmas = false, use_exponential_sigmas = false, use_beta_sigmas = false; + + Config() = default; + explicit Config(const std::string& scheduler_config_path); + }; + + explicit EulerDiscreteScheduler(const std::string scheduler_config_path); + explicit EulerDiscreteScheduler(const Config& scheduler_config); + + void set_timesteps(size_t num_inference_steps) override; + + std::vector get_timesteps() const override; + + float get_init_noise_sigma() const override; + + void scale_model_input(ov::Tensor sample, size_t inference_step) override; + + std::map step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) override; + +private: + Config m_config; + + std::vector m_alphas_cumprod, m_sigmas; + std::vector m_timesteps; + size_t m_num_inference_steps; + + size_t m_step_index = -1; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/scheduler.cpp b/src/cpp/src/text2image/schedulers/scheduler.cpp index cf14640b7c..44b08d67fc 100644 --- a/src/cpp/src/text2image/schedulers/scheduler.cpp +++ b/src/cpp/src/text2image/schedulers/scheduler.cpp @@ -10,6 +10,7 @@ #include "text2image/schedulers/lcm.hpp" #include "text2image/schedulers/lms_discrete.hpp" #include 
"text2image/schedulers/ddim.hpp" +#include "text2image/schedulers/euler_discrete.hpp" namespace ov { namespace genai { @@ -35,6 +36,8 @@ std::shared_ptr Text2ImagePipeline::Scheduler::fr scheduler = std::make_shared(scheduler_config_path); } else if (scheduler_type == Scheduler::Type::DDIM) { scheduler = std::make_shared(scheduler_config_path); + } else if (scheduler_type == Scheduler::Type::EULER_DISCRETE) { + scheduler = std::make_shared(scheduler_config_path); } else { OPENVINO_THROW("Unsupported scheduler type '", scheduler_type, ". Please, manually create scheduler via supported one"); } diff --git a/src/cpp/src/text2image/schedulers/types.cpp b/src/cpp/src/text2image/schedulers/types.cpp index 4ecdcea811..0ca970f359 100644 --- a/src/cpp/src/text2image/schedulers/types.cpp +++ b/src/cpp/src/text2image/schedulers/types.cpp @@ -49,6 +49,8 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Text2I param = Text2ImagePipeline::Scheduler::DDIM; else if (scheduler_type_str == "LMSDiscreteScheduler") param = Text2ImagePipeline::Scheduler::LMS_DISCRETE; + else if (scheduler_type_str == "EulerDiscreteScheduler") + param = Text2ImagePipeline::Scheduler::EULER_DISCRETE; else if (!scheduler_type_str.empty()) { OPENVINO_THROW("Unsupported value for 'prediction_type' ", scheduler_type_str); } @@ -71,6 +73,48 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Timest } } +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, InterpolationType& param) { + if (data.contains(name) && data[name].is_string()) { + std::string interpolation_type = data[name].get(); + if (interpolation_type == "linear") + param = InterpolationType::LINEAR; + else if (interpolation_type == "log_linear") + param = InterpolationType::LOG_LINEAR; + else if (!interpolation_type.empty()) { + OPENVINO_THROW("Unsupported value for 'interpolation_type' ", interpolation_type); + } + } +} + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, FinalSigmaType& param) { + if (data.contains(name) && data[name].is_string()) { + std::string final_sigma_type = data[name].get(); + if (final_sigma_type == "zero") + param = FinalSigmaType::ZERO; + else if (final_sigma_type == "sigma_min") + param = FinalSigmaType::SIGMA_MIN; + else if (!final_sigma_type.empty()) { + OPENVINO_THROW("Unsupported value for 'final_sigma_type' ", final_sigma_type); + } + } +} + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, TimestepType& param) { + if (data.contains(name) && data[name].is_string()) { + std::string timestep_type = data[name].get(); + if (timestep_type == "discrete") + param = TimestepType::DISCRETE; + else if (timestep_type == "continuous") + param = TimestepType::CONTINUOUS; + else if (!timestep_type.empty()) { + OPENVINO_THROW("Unsupported value for 'timestep_type' ", timestep_type); + } + } +} + } // namespace utils } // namespace genai } // namespace ov @@ -83,6 +127,8 @@ std::ostream& operator<<(std::ostream& os, const ov::genai::Text2ImagePipeline:: return os << "LMSDiscreteScheduler"; case ov::genai::Text2ImagePipeline::Scheduler::Type::DDIM: return os << "DDIMScheduler"; + case ov::genai::Text2ImagePipeline::Scheduler::Type::EULER_DISCRETE: + return os << "EulerDiscreteScheduler"; case ov::genai::Text2ImagePipeline::Scheduler::Type::AUTO: return os << "AutoScheduler"; default: diff --git a/src/cpp/src/text2image/schedulers/types.hpp b/src/cpp/src/text2image/schedulers/types.hpp index 
3029998f95..74fde4f993 100644 --- a/src/cpp/src/text2image/schedulers/types.hpp +++ b/src/cpp/src/text2image/schedulers/types.hpp @@ -30,6 +30,21 @@ enum class TimestepSpacing { LEADING }; +enum class InterpolationType { + LINEAR, + LOG_LINEAR +}; + +enum class FinalSigmaType { + ZERO, + SIGMA_MIN +}; + +enum class TimestepType { + DISCRETE, + CONTINUOUS +}; + namespace utils { template <> @@ -44,6 +59,15 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Text2I template <> void read_json_param(const nlohmann::json& data, const std::string& name, TimestepSpacing& param); +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, InterpolationType& param); + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, FinalSigmaType& param); + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, TimestepType& param); + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/text2image/stable_diffusion_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_pipeline.hpp index 84251cb21f..54d2d43c19 100644 --- a/src/cpp/src/text2image/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/text2image/stable_diffusion_pipeline.hpp @@ -7,6 +7,7 @@ #include #include "utils.hpp" +#include "lora_helper.hpp" namespace ov { namespace genai { @@ -106,6 +107,8 @@ class Text2ImagePipeline::StableDiffusionPipeline : public Text2ImagePipeline::D // initialize generation config initialize_generation_config(data["_class_name"].get()); + + update_adapters_from_properties(properties, m_generation_config.adapters); } StableDiffusionPipeline( @@ -149,6 +152,9 @@ class Text2ImagePipeline::StableDiffusionPipeline : public Text2ImagePipeline::D generation_config.width = unet_config.sample_size * vae_scale_factor; check_inputs(generation_config.height, generation_config.width); + m_clip_text_encoder->set_adapters(generation_config.adapters); + m_unet->set_adapters(generation_config.adapters); + if (generation_config.random_generator == nullptr) { uint32_t seed = time(NULL); generation_config.random_generator = std::make_shared(seed); diff --git a/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp new file mode 100644 index 0000000000..95ea2abc5d --- /dev/null +++ b/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp @@ -0,0 +1,345 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text2image/diffusion_pipeline.hpp" + +#include +#include + +#include "utils.hpp" + +namespace ov { +namespace genai { + +class Text2ImagePipeline::StableDiffusionXLPipeline : public Text2ImagePipeline::DiffusionPipeline { +public: + explicit StableDiffusionXLPipeline(const std::string& root_dir) { + const std::string model_index_path = root_dir + "/model_index.json"; + std::ifstream file(model_index_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + + const std::string text_encoder = data["text_encoder"][1].get(); + if (text_encoder == "CLIPTextModel") { + m_clip_text_encoder = std::make_shared(root_dir + "/text_encoder"); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string text_encoder_2 = data["text_encoder_2"][1].get(); + if 
(text_encoder_2 == "CLIPTextModelWithProjection") { + m_clip_text_encoder_with_projection = std::make_shared(root_dir + "/text_encoder_2"); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string unet = data["unet"][1].get(); + if (unet == "UNet2DConditionModel") { + m_unet = std::make_shared(root_dir + "/unet"); + } else { + OPENVINO_THROW("Unsupported '", unet, "' UNet type"); + } + + const std::string vae = data["vae"][1].get(); + if (vae == "AutoencoderKL") { + m_vae_decoder = std::make_shared(root_dir + "/vae_decoder"); + } else { + OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + } + + // initialize generation config + initialize_generation_config(data["_class_name"].get()); + } + + StableDiffusionXLPipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) { + const std::string model_index_path = root_dir + "/model_index.json"; + std::ifstream file(model_index_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + + const std::string text_encoder = data["text_encoder"][1].get(); + if (text_encoder == "CLIPTextModel") { + m_clip_text_encoder = std::make_shared(root_dir + "/text_encoder", device, properties); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string text_encoder_2 = data["text_encoder_2"][1].get(); + if (text_encoder_2 == "CLIPTextModelWithProjection") { + m_clip_text_encoder_with_projection = std::make_shared(root_dir + "/text_encoder_2", device, properties); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string unet = data["unet"][1].get(); + if (unet == "UNet2DConditionModel") { + m_unet = std::make_shared(root_dir + "/unet", device, properties); + } else { + OPENVINO_THROW("Unsupported '", unet, "' UNet type"); + } + + const std::string vae = data["vae"][1].get(); + if (vae == "AutoencoderKL") { + m_vae_decoder = std::make_shared(root_dir + "/vae_decoder", device, properties); + } else { + OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + } + + // initialize generation config + initialize_generation_config(data["_class_name"].get()); + } + + StableDiffusionXLPipeline( + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder) + : m_clip_text_encoder(std::make_shared(clip_text_model)), + m_clip_text_encoder_with_projection(std::make_shared(clip_text_model_with_projection)), + m_unet(std::make_shared(unet)), + m_vae_decoder(std::make_shared(vae_decoder)) { } + + void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) override { + check_inputs(height, width); + + const size_t batch_size_multiplier = do_classifier_free_guidance(guidance_scale) ? 
2 : 1; // Unet accepts 2x batch in case of CFG + m_clip_text_encoder->reshape(batch_size_multiplier); + m_clip_text_encoder_with_projection->reshape(batch_size_multiplier); + m_unet->reshape(num_images_per_prompt * batch_size_multiplier, height, width, m_clip_text_encoder->get_config().max_position_embeddings); + m_vae_decoder->reshape(num_images_per_prompt, height, width); + } + + void compile(const std::string& device, const ov::AnyMap& properties) override { + m_clip_text_encoder->compile(device, properties); + m_clip_text_encoder_with_projection->compile(device, properties); + m_unet->compile(device, properties); + m_vae_decoder->compile(device, properties); + } + + ov::Tensor generate(const std::string& positive_prompt, + const ov::AnyMap& properties) override { + GenerationConfig generation_config = m_generation_config; + generation_config.update_generation_config(properties); + + // Stable Diffusion pipeline + // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline + + const auto& unet_config = m_unet->get_config(); + const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Unet accepts 2x batch in case of CFG + const size_t vae_scale_factor = m_unet->get_vae_scale_factor(); + + if (generation_config.height < 0) + generation_config.height = unet_config.sample_size * vae_scale_factor; + if (generation_config.width < 0) + generation_config.width = unet_config.sample_size * vae_scale_factor; + check_inputs(generation_config.height, generation_config.width); + + if (generation_config.random_generator == nullptr) { + uint32_t seed = time(NULL); + generation_config.random_generator = std::make_shared(seed); + } + + std::vector time_ids = {static_cast(generation_config.width), + static_cast(generation_config.height), + 0, + 0, + static_cast(generation_config.width), + static_cast(generation_config.height), + }; + ov::Tensor add_time_ids(ov::element::f32, {batch_size_multiplier, time_ids.size()}); + float* add_time_ids_data = add_time_ids.data(); + std::copy(time_ids.begin(), time_ids.end(), add_time_ids_data); + + if (batch_size_multiplier > 1) { + std::copy(time_ids.begin(), time_ids.end(), add_time_ids_data + time_ids.size()); + } + + ov::Tensor add_text_embeds = m_clip_text_encoder_with_projection->infer(positive_prompt, generation_config.negative_prompt, batch_size_multiplier > 1); + m_clip_text_encoder->infer(positive_prompt, generation_config.negative_prompt, batch_size_multiplier > 1); + + // prompt_embeds = prompt_embeds.hidden_states[-2] + size_t idx_hidden_state_1 = m_clip_text_encoder->get_config().num_hidden_layers; + ov::Tensor encoder_hidden_states_1 = m_clip_text_encoder->get_output_tensor(idx_hidden_state_1); + size_t idx_hidden_state_2 = m_clip_text_encoder_with_projection->get_config().num_hidden_layers; + ov::Tensor encoder_hidden_states_2 = m_clip_text_encoder_with_projection->get_output_tensor(idx_hidden_state_2); + + ov::Shape ehs_1_shape = encoder_hidden_states_1.get_shape(); + ov::Shape ehs_2_shape = encoder_hidden_states_2.get_shape(); + + OPENVINO_ASSERT(ehs_1_shape[0] == ehs_2_shape[0] && ehs_1_shape[1] == ehs_2_shape[1], + "Tensors for concatenation must have the same dimensions"); + + // concatenate hidden_states from two encoders + ov::Shape encoder_hidden_states_shape = {ehs_1_shape[0], ehs_1_shape[1], ehs_1_shape[2] + ehs_2_shape[2]}; + ov::Tensor encoder_hidden_states(encoder_hidden_states_1.get_element_type(), 
encoder_hidden_states_shape); + + const float* ehs_1_data = encoder_hidden_states_1.data(); + const float* ehs_2_data = encoder_hidden_states_2.data(); + float* encoder_hidden_states_data = encoder_hidden_states.data(); + + for (size_t i = 0; i < ehs_1_shape[0]; ++i) { + for (size_t j = 0; j < ehs_1_shape[1]; ++j) { + size_t offset_1 = (i * ehs_1_shape[1] + j) * ehs_1_shape[2]; + size_t offset_2 = (i * ehs_2_shape[1] + j) * ehs_2_shape[2]; + + size_t step = (i * ehs_1_shape[1] + j) * (ehs_1_shape[2] + ehs_2_shape[2]); + + std::memcpy(encoder_hidden_states_data + step, ehs_1_data + offset_1, ehs_1_shape[2] * sizeof(float)); + std::memcpy(encoder_hidden_states_data + step + ehs_1_shape[2], ehs_2_data + offset_2, ehs_2_shape[2] * sizeof(float)); + } + } + + // replicate encoder hidden state to UNet model + if (generation_config.num_images_per_prompt == 1) { + // reuse output of text encoder directly w/o extra memory copy + m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states); + m_unet->set_hidden_states("text_embeds", add_text_embeds); + m_unet->set_hidden_states("time_ids", add_time_ids); + + } else { + ov::Shape enc_shape = encoder_hidden_states.get_shape(); + enc_shape[0] *= generation_config.num_images_per_prompt; + + ov::Tensor encoder_hidden_states_repeated(encoder_hidden_states.get_element_type(), enc_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, 0, n); + if (batch_size_multiplier > 1) { + batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states_repeated); + + ov::Shape t_emb_shape = add_text_embeds.get_shape(); + t_emb_shape[0] *= generation_config.num_images_per_prompt; + + ov::Tensor add_text_embeds_repeated(add_text_embeds.get_element_type(), t_emb_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(add_text_embeds, add_text_embeds_repeated, 0, n); + if (batch_size_multiplier > 1) { + batch_copy(add_text_embeds, add_text_embeds_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("text_embeds", add_text_embeds_repeated); + + ov::Shape t_ids_shape = add_time_ids.get_shape(); + t_ids_shape[0] *= generation_config.num_images_per_prompt; + ov::Tensor add_time_ids_repeated(add_time_ids.get_element_type(), t_ids_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(add_time_ids, add_time_ids_repeated, 0, n); + if (batch_size_multiplier > 1) { + batch_copy(add_time_ids, add_time_ids_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("time_ids", add_time_ids_repeated); + } + + m_scheduler->set_timesteps(generation_config.num_inference_steps); + std::vector timesteps = m_scheduler->get_timesteps(); + + // latents are multiplied by 'init_noise_sigma' + ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels, + generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; + ov::Shape latent_shape_cfg = latent_shape; + latent_shape_cfg[0] *= batch_size_multiplier; + + ov::Tensor latent(ov::element::f32, latent_shape), latent_cfg(ov::element::f32, latent_shape_cfg); + std::generate_n(latent.data(), latent.get_size(), [&]() -> float { + return generation_config.random_generator->next() * 
m_scheduler->get_init_noise_sigma(); + }); + + ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {}); + for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; inference_step++) { + // concat the same latent twice along a batch dimension in case of CFG + if (batch_size_multiplier > 1) { + batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); + batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt); + } else { + // just assign to save memory copy + latent_cfg = latent; + } + + m_scheduler->scale_model_input(latent_cfg, inference_step); + + ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); + ov::Tensor noise_pred_tensor = m_unet->infer(latent_cfg, timestep); + + ov::Shape noise_pred_shape = noise_pred_tensor.get_shape(); + noise_pred_shape[0] /= batch_size_multiplier; + noisy_residual_tensor.set_shape(noise_pred_shape); + + if (batch_size_multiplier > 1) { + // perform guidance + float* noisy_residual = noisy_residual_tensor.data(); + const float* noise_pred_uncond = noise_pred_tensor.data(); + const float* noise_pred_text = noise_pred_uncond + noisy_residual_tensor.get_size(); + + for (size_t i = 0; i < noisy_residual_tensor.get_size(); ++i) { + noisy_residual[i] = noise_pred_uncond[i] + + generation_config.guidance_scale * (noise_pred_text[i] - noise_pred_uncond[i]); + } + } else { + noisy_residual_tensor = noise_pred_tensor; + } + + auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step); + latent = scheduler_step_result["latent"]; + + // check whether scheduler returns "denoised" image, which should be passed to VAE decoder + const auto it = scheduler_step_result.find("denoised"); + denoised = it != scheduler_step_result.end() ? it->second : latent; + } + + return m_vae_decoder->infer(denoised); + } + +private: + bool do_classifier_free_guidance(float guidance_scale) const { + return guidance_scale > 1.0 && m_unet->get_config().time_cond_proj_dim < 0; + } + + void initialize_generation_config(const std::string& class_name) override { + assert(m_unet != nullptr); + const auto& unet_config = m_unet->get_config(); + const size_t vae_scale_factor = m_unet->get_vae_scale_factor(); + + m_generation_config.height = unet_config.sample_size * vae_scale_factor; + m_generation_config.width = unet_config.sample_size * vae_scale_factor; + + if (class_name == "StableDiffusionXLPipeline") { + m_generation_config.guidance_scale = 5.0f; + m_generation_config.num_inference_steps = 50; + } else { + OPENVINO_THROW("Unsupported class_name '", class_name, "'. 
Please, contact OpenVINO GenAI developers"); + } + } + + void check_inputs(const int height, const int width) const override { + assert(m_unet != nullptr); + const size_t vae_scale_factor = m_unet->get_vae_scale_factor(); + OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) && + (width % vae_scale_factor == 0 || width < 0), "Both 'width' and 'height' must be divisible by", + vae_scale_factor); + } + + std::shared_ptr m_clip_text_encoder; + std::shared_ptr m_clip_text_encoder_with_projection; + std::shared_ptr m_unet; + std::shared_ptr m_vae_decoder; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/text2image_pipeline.cpp b/src/cpp/src/text2image/text2image_pipeline.cpp index b8a8e1898b..f7a6ab65ae 100644 --- a/src/cpp/src/text2image/text2image_pipeline.cpp +++ b/src/cpp/src/text2image/text2image_pipeline.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "text2image/stable_diffusion_pipeline.hpp" +#include "text2image/stable_diffusion_xl_pipeline.hpp" #include #include @@ -44,7 +45,8 @@ void Text2ImagePipeline::GenerationConfig::update_generation_config(const ov::An read_anymap_param(properties, "height", height); read_anymap_param(properties, "width", width); read_anymap_param(properties, "num_inference_steps", num_inference_steps); - + read_anymap_param(properties, "adapters", adapters); + validate(); } @@ -60,8 +62,10 @@ Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir) { const std::string class_name = get_class_name(root_dir); if (class_name == "StableDiffusionPipeline" || - class_name == "LatentConsistencyModelPipeline") { + class_name == "LatentConsistencyModelPipeline") { m_impl = std::make_shared(root_dir); + } else if (class_name == "StableDiffusionXLPipeline") { + m_impl = std::make_shared(root_dir); } else { OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); } @@ -70,15 +74,17 @@ Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir) { Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) { const std::string class_name = get_class_name(root_dir); - if (class_name == "StableDiffusionPipeline" || + if (class_name == "StableDiffusionPipeline" || class_name == "LatentConsistencyModelPipeline") { m_impl = std::make_shared(root_dir, device, properties); + } else if (class_name == "StableDiffusionXLPipeline") { + m_impl = std::make_shared(root_dir, device, properties); } else { OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); } } -Text2ImagePipeline::Text2ImagePipeline(const std::shared_ptr& impl) +Text2ImagePipeline::Text2ImagePipeline(const std::shared_ptr& impl) : m_impl(impl) { assert(m_impl != nullptr); } @@ -104,6 +110,20 @@ Text2ImagePipeline Text2ImagePipeline::latent_consistency_model( return stable_diffusion(scheduler, clip_text_model, unet, vae_decoder); } +Text2ImagePipeline Text2ImagePipeline::stable_diffusion_xl( + const std::shared_ptr& scheduler, + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder) { + auto impl = std::make_shared(clip_text_model, clip_text_model_with_projection, unet, vae_decoder); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return Text2ImagePipeline(impl); +} + Text2ImagePipeline::GenerationConfig Text2ImagePipeline::get_generation_config() const { return 
m_impl->get_generation_config(); } diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 823ac7fd6e..229c418e54 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -211,6 +211,20 @@ std::pair split_core_complile_config(const ov::AnyMap& p return {core_config, compile_config}; }; +ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend) { + auto minuend_size = minuend.input_ids.get_size(); + auto subtrahend_size = subtrahend.input_ids.get_size(); + ov::Shape new_shape{1, minuend_size - subtrahend_size}; + + ov::Tensor new_input_ids(ov::element::i64, new_shape); + auto data_ptr = minuend.input_ids.data<int64_t>(); + std::copy(data_ptr + subtrahend_size, data_ptr + minuend_size, new_input_ids.data<int64_t>()); + + ov::Tensor new_attention_mask(ov::element::i64, new_shape); + std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1); + + return {new_input_ids, new_attention_mask}; +} } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index fe6e4eed14..2b7ff18e2d 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -86,20 +86,7 @@ ProcessorConfig from_any_map( std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& plugin_config); -inline ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& fisrt, const ov::genai::TokenizedInputs& second){ - auto first_size = fisrt.input_ids.get_size(); - auto second_size = second.input_ids.get_size(); - ov::Shape new_shape{1, first_size - second_size}; - - ov::Tensor new_input_ids(ov::element::i64, new_shape); - auto data_ptr = fisrt.input_ids.data<int64_t>(); - std::copy(data_ptr + second_size, data_ptr + first_size, new_input_ids.data<int64_t>()); - - ov::Tensor new_attention_mask(ov::element::i64, new_shape); - std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1); - - return {new_input_ids, new_attention_mask}; -} +ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend); } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp index 05539b67dc..8e8612697c 100644 --- a/src/cpp/src/vision_encoder.cpp +++ b/src/cpp/src/vision_encoder.cpp @@ -228,7 +228,7 @@ std::vector bucket_size_right(const std::vector& fractional_coor ov::Tensor prepare_vis_position_ids( const ov::Tensor& pixel_values, const ov::Tensor& patch_attention_mask, - const std::vector<HeightWidth> tgt_sizes, + const std::vector<ImageSize> tgt_sizes, size_t patch_size, size_t num_patches_per_side ) { @@ -283,7 +283,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o }; std::vector<std::vector<clip_image_u8>> imgs = ::slice_image(source, max_slice_nums, scale_resolution, patch_size, never_split); std::vector> results; - std::vector<std::vector<HeightWidth>> sizes; + std::vector<std::vector<ImageSize>> sizes; // std::vector img_res_v; // format N x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()}; @@ -296,7 +296,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o }); const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0); - HeightWidth resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size}; + ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size}; 
ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())}; ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size); encoder.set_tensor("pixel_values", pixel_values); @@ -314,27 +314,29 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o return {std::move(resized_source), resized_source_size}; } - HeightWidth size{ + ImageSize raw_size{ size_t(preprocessed.at(1).at(0).ny), size_t(preprocessed.at(1).at(0).nx) }; - std::vector sliced_sizes; - size_t n_patches = size.height / patch_size * size.width / patch_size, + ImageSize slices_size{ + raw_size.height / patch_size, + raw_size.width / patch_size + }; + size_t n_patches = slices_size.height * slices_size.width, old_hidden_size = resized_source.get_shape().at(2); ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}}; for (size_t row = 1; row < preprocessed.size(); ++row) { for (size_t col = 0; col < preprocessed.at(row).size(); ++col) { clip_image_f32& elem = preprocessed.at(row).at(col); - sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size}); ov::Tensor pixel_values = preprocess_for_encoder( {ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()}, patch_size ); encoder.set_tensor("pixel_values", pixel_values); - ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, sliced_sizes.back().height * sliced_sizes.back().width}}; + ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, slices_size.height * slices_size.width}}; std::fill_n(patch_attention_mask.data(), patch_attention_mask.get_size(), true); encoder.set_tensor("patch_attention_mask", patch_attention_mask); - ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {sliced_sizes.back()}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size); + ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size); encoder.set_tensor("position_ids", position_ids); const ov::Tensor& old = encoder.get_output_tensor(); encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size}); @@ -342,7 +344,23 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o encoder.set_output_tensor(old); } } - return {resized_source, resized_source_size, encoded_slices, sliced_sizes}; + return {resized_source, resized_source_size, encoded_slices, slices_size}; +} + +ProcessorConfig from_any_map( + const ov::AnyMap& config_map, + const ProcessorConfig& initial +) { + auto iter = config_map.find("processor_config"); + ProcessorConfig extracted_config = config_map.end() != iter ? 
+ iter->second.as() : initial; + using utils::read_anymap_param; + read_anymap_param(config_map, "patch_size", extracted_config.patch_size); + read_anymap_param(config_map, "scale_resolution", extracted_config.scale_resolution); + read_anymap_param(config_map, "max_slice_nums", extracted_config.max_slice_nums); + read_anymap_param(config_map, "norm_mean", extracted_config.norm_mean); + read_anymap_param(config_map, "norm_std", extracted_config.norm_std); + return extracted_config; } } @@ -366,7 +384,7 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfi } EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) { - return encode(image, utils::from_any_map( + return encode(image, from_any_map( config_map, m_processor_config )); } diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index 99c38c976d..3bdc3d9ae9 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -94,7 +94,7 @@ ov::Tensor process_prompt(ov::InferRequest& embedding, const ov::Tensor& prompt, return embed_output_tensor; } -ov::Tensor concatenate(const ov::Tensor& first, const ov::Tensor& second) { +ov::Tensor concatenate_last_dim(const ov::Tensor& first, const ov::Tensor& second) { size_t res_d_0 = first.get_shape().at(0); size_t res_d_1 = first.get_shape().at(1); OPENVINO_ASSERT(second.get_shape().at(0) == res_d_0); @@ -187,7 +187,7 @@ ov::Tensor get_1d_sincos_pos_embed_from_grid_new(size_t embed_dim, const ov::Ten std::transform(out_data, out_data + out.get_size(), emb_cos_data, [](float arg) { return std::cos(arg); }); - return concatenate(emb_sin, emb_cos); // (H, W, D) + return concatenate_last_dim(emb_sin, emb_cos); // (H, W, D) } ov::Tensor get_2d_sincos_pos_embed_from_grid(size_t embed_dim, const ov::Tensor& grid) { @@ -201,13 +201,13 @@ ov::Tensor get_2d_sincos_pos_embed_from_grid(size_t embed_dim, const ov::Tensor& end_w.at(0) = 2; ov::Tensor emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{grid, begin_h, end_h}); // (H, W, D/2) ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{grid, begin_w, end_w}); // (H, W, D/2) - return concatenate(emb_h, emb_w); + return concatenate_last_dim(emb_h, emb_w); } /// image_size: image_size or (image_height, image_width) /// return: /// pos_embed: [image_height, image_width, embed_dim] -ov::Tensor get_2d_sincos_pos_embed(size_t embed_dim, const HeightWidth& image_size) { +ov::Tensor get_2d_sincos_pos_embed(size_t embed_dim, const ImageSize& image_size) { size_t grid_h_size = image_size.height, grid_w_size = image_size.width; ov::Tensor grid(ov::element::f32, {2, grid_h_size, grid_w_size}); float* data = grid.data(); @@ -223,14 +223,14 @@ ov::Tensor get_2d_sincos_pos_embed(size_t embed_dim, const HeightWidth& image_si } void adjust_pos_cache( - const std::vector& target_sizes, + const std::vector& target_sizes, size_t hidden_size, ov::Tensor& pos_embed_cache ) { - size_t max_h = std::max_element(target_sizes.begin(), target_sizes.end(), [](const HeightWidth& left, const HeightWidth& right) { + size_t max_h = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { return left.height < right.height; })->height; - size_t max_w = std::max_element(target_sizes.begin(), target_sizes.end(), [](const HeightWidth& left, const HeightWidth& right) { + size_t max_w = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { 
return left.width < right.width; })->width; size_t allocated_height, allocated_width; @@ -250,10 +250,10 @@ void adjust_pos_cache( } } -ov::Tensor resample(VLMPipeline& pipe, const ov::Tensor& encoded_image, const std::vector& target_sizes) { +ov::Tensor resample(VLMPipeline& pipe, const ov::Tensor& encoded_image, const std::vector& target_sizes) { size_t bs = encoded_image.get_shape().at(0); std::vector patch_len{target_sizes.size()}; - std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const HeightWidth& height_width) { + std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const ImageSize& height_width) { return height_width.height * height_width.width; }); adjust_pos_cache( @@ -301,25 +301,23 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { VLMPipeline::VLMPipeline( const std::filesystem::path& model_dir, - const Tokenizer& tokenizer, const std::string& device, - const ov::AnyMap device_config, - ov::Core core + const ov::AnyMap device_config ) : m_vlm_config{ utils::from_config_json_if_exists( model_dir, "config.json" ) }, - m_tokenizer{tokenizer}, - m_vision_encoder(model_dir, device, device_config, core), - m_resampler{core.compile_model( + m_tokenizer{Tokenizer(model_dir.string(), device_config)}, + m_vision_encoder(model_dir, device, device_config, ov::Core{}), + m_resampler{ov::Core{}.compile_model( model_dir / "resampler.xml", device, device_config ).create_infer_request()}, - m_embedding{core.compile_model( + m_embedding{ov::Core{}.compile_model( model_dir / "embed_tokens.xml", device, device_config ).create_infer_request()}, - m_language{core.compile_model( + m_language{ov::Core{}.compile_model( model_dir / "language_model.xml", device, device_config ).create_infer_request()}, m_pos_embed_cache{ @@ -340,31 +338,48 @@ DecodedResults VLMPipeline::generate( std::string images_prompt; std::vector embeds; for (const ov::Tensor& rgb : rgbs) { - EncodedImage encoded_image = m_vision_encoder.encode(rgb); - if (m_vlm_config.use_image_id) { - images_prompt += m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; - ++image_id; + ov::Tensor reshaped = rgb; + ov::Shape rgb_shape = rgb.get_shape(); + switch (rgb_shape.size()) { + case 3: + reshaped.set_shape({1, rgb_shape.at(0), rgb_shape.at(1), rgb_shape.at(2)}); + break; + case 4: break; + default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); } - std::string unk64; - for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { - unk64 += m_vlm_config.unk; - } - images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; - if (encoded_image.slices) { - ov::Shape slices_shape = encoded_image.slices.get_shape(); - for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { - for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { - images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; + ov::Shape reshaped_shape = reshaped.get_shape(); + for (size_t batch_idx = 0; batch_idx < reshaped_shape.at(0); ++batch_idx) { + ov::Tensor single_image{ + ov::element::u8, + {1, reshaped_shape.at(1), reshaped_shape.at(2), reshaped_shape.at(3)}, + reshaped.data() + batch_idx * reshaped_shape.at(1) * reshaped_shape.at(1) * reshaped_shape.at(1) + }; + EncodedImage encoded_image = m_vision_encoder.encode(single_image); + if (m_vlm_config.use_image_id) { + images_prompt += m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; + ++image_id; + } + std::string unk64; + for (size_t idx = 0; idx < 
m_vlm_config.query_num; ++idx) { + unk64 += m_vlm_config.unk; + } + images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; + if (encoded_image.slices) { + ov::Shape slices_shape = encoded_image.slices.get_shape(); + for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { + for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { + images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; + } + images_prompt += '\n'; } + } + if ('\n' != *(images_prompt.end() - 1)) { + // Image wasn't sliced, add \n to the end of image anyway. + // Strangely, \n isn't placed between . images_prompt += '\n'; } + embeds.push_back(std::move(encoded_image)); } - if ('\n' != *(images_prompt.end() - 1)) { - // Image wasn't sliced, add \n to the end of image anyway. - // Strangely, \n isn't placed between . - images_prompt += '\n'; - } - embeds.push_back(std::move(encoded_image)); } images_prompt += prompt; ov::Tensor encoded_input; @@ -432,13 +447,12 @@ DecodedResults VLMPipeline::generate( if (encoded_image.slices) { size_t token_idx = 0; const ov::Shape& slices_shape = encoded_image.slices.get_shape(); - const std::vector& sliced_sizes = encoded_image.slices_sizes; for (size_t i = 0; i < slices_shape.at(0); ++i) { for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { size_t d2 = slices_shape.at(2); size_t d3 = slices_shape.at(3); ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; - const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {sliced_sizes.at(i * slices_shape.at(1) + ja)}); + const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {encoded_image.slices_size}); ids = std::find(ids, end, slice_start_id); OPENVINO_ASSERT(end != ids); std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index c032ca1a55..bf76f34f4f 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -18,7 +18,7 @@ if(NOT pybind11_POPULATED) add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) endif() -pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp py_whisper_pipeline.cpp utils.cpp) +pybind11_add_module(py_generate_pipeline py_vlm_pipeline.cpp py_generate_pipeline.cpp py_whisper_pipeline.cpp utils.cpp) target_link_libraries(py_generate_pipeline PRIVATE openvino::genai) set_target_properties(py_generate_pipeline PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index c4d219fcf4..879dfc8262 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -17,7 +17,8 @@ EncodedResults, GenerationConfig, GenerationResult, - LLMPipeline, + LLMPipeline, + VLMPipeline, PerfMetrics, RawPerfMetrics, SchedulerConfig, diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 74b704f34b..b636253e33 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -50,6 +50,7 @@ std::vector get_ms(const T& instance, U T::*member) { } void init_whisper_pipeline(py::module_& m); +void init_vlm_pipeline(py::module_& m); namespace { @@ -310,68 +311,6 @@ auto cache_eviction_config_docstring = R"( :type aggregation_mode: openvino_genai.AggregationMode )"; 
-OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfig& config, const py::kwargs& kwargs) { - if(!config.has_value() && kwargs.empty()) - return std::nullopt; - - GenerationConfig res_config; - if(config.has_value()) - res_config = *config; - - for (const auto& item : kwargs) { - std::string key = py::cast(item.first); - py::object value = py::cast(item.second); - - if (item.second.is_none()) { - // Even if argument key name does not fit GenerationConfig name - // it's not an eror if it's not defined. - // Some HF configs can have parameters for methods currenly unsupported in ov_genai - // but if their values are not set / None, then this should not block - // us from reading such configs, e.g. {"typical_p": None, 'top_p': 1.0,...} - return res_config; - } - - if (key == "max_new_tokens") { - res_config.max_new_tokens = py::cast(item.second); - } else if (key == "max_length") { - res_config.max_length = py::cast(item.second); - } else if (key == "ignore_eos") { - res_config.ignore_eos = py::cast(item.second); - } else if (key == "num_beam_groups") { - res_config.num_beam_groups = py::cast(item.second); - } else if (key == "num_beams") { - res_config.num_beams = py::cast(item.second); - } else if (key == "diversity_penalty") { - res_config.diversity_penalty = py::cast(item.second); - } else if (key == "length_penalty") { - res_config.length_penalty = py::cast(item.second); - } else if (key == "num_return_sequences") { - res_config.num_return_sequences = py::cast(item.second); - } else if (key == "no_repeat_ngram_size") { - res_config.no_repeat_ngram_size = py::cast(item.second); - } else if (key == "stop_criteria") { - res_config.stop_criteria = py::cast(item.second); - } else if (key == "temperature") { - res_config.temperature = py::cast(item.second); - } else if (key == "top_p") { - res_config.top_p = py::cast(item.second); - } else if (key == "top_k") { - res_config.top_k = py::cast(item.second); - } else if (key == "do_sample") { - res_config.do_sample = py::cast(item.second); - } else if (key == "repetition_penalty") { - res_config.repetition_penalty = py::cast(item.second); - } else if (key == "eos_token_id") { - res_config.set_eos_token_id(py::cast(item.second)); - } else { - throw(std::invalid_argument("'" + key + "' is incorrect GenerationConfig parameter name. " - "Use help(openvino_genai.GenerationConfig) to get list of acceptable parameters.")); - } - } - - return res_config; -} - py::list handle_utf8_results(const std::vector& decoded_res) { // pybind11 decodes strings similar to Pythons's // bytes.decode('utf-8'). It raises if the decoding fails. @@ -392,26 +331,10 @@ py::object call_common_generate( const utils::PyBindStreamerVariant& py_streamer, const py::kwargs& kwargs ) { - auto updated_config = update_config_from_kwargs(config, kwargs); + auto updated_config = ov::genai::pybind::utils::update_config_from_kwargs(config, kwargs); py::object results; EncodedInputs tensor_data; - StreamerVariant streamer = std::monostate(); - - std::visit(utils::overloaded { - [&streamer](const std::function& py_callback){ - // Wrap python streamer with manual utf-8 decoding. Do not rely - // on pybind automatic decoding since it raises exceptions on incomplete strings. 
- auto callback_wrapped = [&py_callback](std::string subword) -> bool { - auto py_str = PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace"); - return py_callback(py::reinterpret_borrow(py_str)); - }; - streamer = callback_wrapped; - }, - [&streamer](std::shared_ptr streamer_cls){ - streamer = streamer_cls; - }, - [](std::monostate none){ /*streamer is already a monostate */ } - }, py_streamer); + StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer); // Call suitable generate overload for each type of input. std::visit(utils::overloaded { @@ -635,7 +558,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { // Binding for GenerationConfig py::class_(m, "GenerationConfig", generation_config_docstring) .def(py::init(), py::arg("json_path"), "path where generation_config.json is stored") - .def(py::init([](py::kwargs kwargs) { return *update_config_from_kwargs(GenerationConfig(), kwargs); })) + .def(py::init([](py::kwargs kwargs) { return *ov::genai::pybind::utils::update_config_from_kwargs(GenerationConfig(), kwargs); })) .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) .def_readwrite("max_length", &GenerationConfig::max_length) .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) @@ -840,4 +763,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { // init whisper bindings init_whisper_pipeline(m); + + // init vlm pipeline + init_vlm_pipeline(m); } diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp new file mode 100644 index 0000000000..04faed542a --- /dev/null +++ b/src/python/py_vlm_pipeline.cpp @@ -0,0 +1,180 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + + +#include +#include +#include +#include +#include +#include "openvino/genai/vlm_pipeline.hpp" +#include "../cpp/src/tokenizers_path.hpp" +#include "./utils.hpp" + +namespace py = pybind11; +namespace utils = ov::genai::pybind::utils; + + +auto vlm_generate_docstring = R"( + Generates sequences for VLMs. + + :param prompt: input prompt + :type prompt: str + + :param images: list of images + :type inputs: List[ov.Tensor] + + :param generation_config: generation_config + :type generation_config: GenerationConfig or a Dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped + :type : Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. + :type : Dict + + :return: return results in decoded form + :rtype: DecodedResults +)"; + +auto vlm_generate_kwargs_docstring = R"( + Generates sequences for VLMs. + + :param prompt: input prompt + :type prompt: str + + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. 
+ + Expected parameters list: + image: ov.Tensor - input image, + images: List[ov.Tensor] - input images, + generation_config: GenerationConfig, + streamer: Callable[[str], bool], ov.genai.StreamerBase - streamer either as a lambda with a boolean returning flag whether generation should be stopped + + :return: return results in decoded form + :rtype: DecodedResults +)"; + +py::object call_vlm_generate( + ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const std::vector& images, + const ov::genai::GenerationConfig& generation_config, + const utils::PyBindStreamerVariant& py_streamer, + const py::kwargs& kwargs +) { + auto updated_config = *ov::genai::pybind::utils::update_config_from_kwargs(generation_config, kwargs); + ov::genai::StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer); + + return py::cast(pipe.generate(prompt, images, updated_config, streamer)); +} + +py::object call_vlm_generate( + ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const py::kwargs& kwargs +) { + ov::AnyMap params = {}; + + for (const auto& item : kwargs) { + std::string key = py::cast(item.first); + py::object value = py::cast(item.second); + + if (key == "images") { + params.insert({ov::genai::images(std::move(py::cast>(item.second)))}); + } else if (key == "image") { + params.insert({ov::genai::image(std::move(py::cast(item.second)))}); + } else if (key == "generation_config") { + params.insert({ov::genai::generation_config(std::move(py::cast(item.second)))}); + } else if (key == "streamer") { + auto py_streamer = py::cast(value); + params.insert({ov::genai::streamer(std::move(ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer)))}); + + } else { + throw(std::invalid_argument("'" + key + "' is unexpected parameter name. " + "Use help(openvino_genai.VLMPipeline.generate) to get list of acceptable parameters.")); + } + } + + return py::cast(pipe.generate(prompt, params)); +} + +void init_vlm_pipeline(py::module_& m) { + py::class_(m, "VLMPipeline", "This class is used for generation with VLMs") + .def(py::init([]( + const std::string& model_path, + const std::string& device, + const std::map& config + ) { + ScopedVar env_manager(utils::ov_tokenizers_module_path()); + return std::make_unique(model_path, device, utils::properties_to_any_map(config)); + }), + py::arg("model_path"), "folder with exported model files", + py::arg("device") = "CPU", "device on which inference will be done", + py::arg("config") = ov::AnyMap({}), "openvino.properties map" + R"( + VLMPipeline class constructor. + model_path (str): Path to the folder with exported model files. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. 
+ )") + + .def("start_chat", &ov::genai::VLMPipeline::start_chat, py::arg("system_message") = "") + .def("finish_chat", &ov::genai::VLMPipeline::finish_chat) + .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config) + .def( + "generate", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const std::vector& images, + const ov::genai::GenerationConfig& generation_config, + const utils::PyBindStreamerVariant& streamer, + const py::kwargs& kwargs + ) { + return call_vlm_generate(pipe, prompt, images, generation_config, streamer, kwargs); + }, + py::arg("prompt"), "Input string", + py::arg("images"), "Input images", + py::arg("generation_config") = std::nullopt, "generation_config", + py::arg("streamer") = std::monostate(), "streamer", + (vlm_generate_docstring + std::string(" \n ")).c_str() + ) + .def( + "generate", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const py::kwargs& kwargs + ) { + return call_vlm_generate(pipe, prompt, kwargs); + }, + py::arg("prompt"), "Input string", + (vlm_generate_kwargs_docstring + std::string(" \n ")).c_str() + ) + .def( + "__call__", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const std::vector& images, + const ov::genai::GenerationConfig& generation_config, + const utils::PyBindStreamerVariant& streamer, + const py::kwargs& kwargs + ) { + return call_vlm_generate(pipe, prompt, images, generation_config, streamer, kwargs); + }, + py::arg("prompt"), "Input string", + py::arg("images"), "Input images", + py::arg("generation_config") = std::nullopt, "generation_config", + py::arg("streamer") = std::monostate(), "streamer", + (vlm_generate_docstring + std::string(" \n ")).c_str() + ) + .def( + "__call__", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const py::kwargs& kwargs + ) { + return call_vlm_generate(pipe, prompt, kwargs); + }, + py::arg("prompt"), "Input string", + (vlm_generate_kwargs_docstring + std::string(" \n ")).c_str() + ); +} diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index b7c6756e89..5d354ef93c 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -170,23 +170,7 @@ py::object call_whisper_common_generate(WhisperPipeline& pipe, auto updated_config = update_whisper_config_from_kwargs(base_config, kwargs); - StreamerVariant streamer = std::monostate(); - - std::visit(utils::overloaded{[&streamer](const std::function& py_callback) { - // Wrap python streamer with manual utf-8 decoding. Do not rely - // on pybind automatic decoding since it raises exceptions on incomplete strings. 
- auto callback_wrapped = [&py_callback](std::string subword) -> bool { - auto py_str = - PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace"); - return py_callback(py::reinterpret_borrow(py_str)); - }; - streamer = callback_wrapped; - }, - [&streamer](std::shared_ptr streamer_cls) { - streamer = streamer_cls; - }, - [](std::monostate none) { /*streamer is already a monostate */ }}, - py_streamer); + StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer); return py::cast(pipe.generate(raw_speech_input, updated_config, streamer)); } diff --git a/src/python/utils.cpp b/src/python/utils.cpp index bf8f195766..65033d0866 100644 --- a/src/python/utils.cpp +++ b/src/python/utils.cpp @@ -161,4 +161,87 @@ std::string ov_tokenizers_module_path() { return py::str(py::module_::import("openvino_tokenizers").attr("_ext_path")); } +ov::genai::StreamerVariant pystreamer_to_streamer(const utils::PyBindStreamerVariant& py_streamer) { + ov::genai::StreamerVariant streamer = std::monostate(); + + std::visit(utils::overloaded { + [&streamer](const std::function& py_callback){ + // Wrap python streamer with manual utf-8 decoding. Do not rely + // on pybind automatic decoding since it raises exceptions on incomplete strings. + auto callback_wrapped = [py_callback](std::string subword) -> bool { + auto py_str = PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace"); + return py_callback(py::reinterpret_borrow(py_str)); + }; + streamer = callback_wrapped; + }, + [&streamer](std::shared_ptr streamer_cls){ + streamer = streamer_cls; + }, + [](std::monostate none){ /*streamer is already a monostate */ } + }, py_streamer); + return streamer; +} + +ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::OptionalGenerationConfig& config, const py::kwargs& kwargs) { + if(!config.has_value() && kwargs.empty()) + return std::nullopt; + + ov::genai::GenerationConfig res_config; + if(config.has_value()) + res_config = *config; + + for (const auto& item : kwargs) { + std::string key = py::cast(item.first); + py::object value = py::cast(item.second); + + if (item.second.is_none()) { + // Even if argument key name does not fit GenerationConfig name + // it's not an eror if it's not defined. + // Some HF configs can have parameters for methods currenly unsupported in ov_genai + // but if their values are not set / None, then this should not block + // us from reading such configs, e.g. 
{"typical_p": None, 'top_p': 1.0,...} + return res_config; + } + + if (key == "max_new_tokens") { + res_config.max_new_tokens = py::cast(item.second); + } else if (key == "max_length") { + res_config.max_length = py::cast(item.second); + } else if (key == "ignore_eos") { + res_config.ignore_eos = py::cast(item.second); + } else if (key == "num_beam_groups") { + res_config.num_beam_groups = py::cast(item.second); + } else if (key == "num_beams") { + res_config.num_beams = py::cast(item.second); + } else if (key == "diversity_penalty") { + res_config.diversity_penalty = py::cast(item.second); + } else if (key == "length_penalty") { + res_config.length_penalty = py::cast(item.second); + } else if (key == "num_return_sequences") { + res_config.num_return_sequences = py::cast(item.second); + } else if (key == "no_repeat_ngram_size") { + res_config.no_repeat_ngram_size = py::cast(item.second); + } else if (key == "stop_criteria") { + res_config.stop_criteria = py::cast(item.second); + } else if (key == "temperature") { + res_config.temperature = py::cast(item.second); + } else if (key == "top_p") { + res_config.top_p = py::cast(item.second); + } else if (key == "top_k") { + res_config.top_k = py::cast(item.second); + } else if (key == "do_sample") { + res_config.do_sample = py::cast(item.second); + } else if (key == "repetition_penalty") { + res_config.repetition_penalty = py::cast(item.second); + } else if (key == "eos_token_id") { + res_config.set_eos_token_id(py::cast(item.second)); + } else { + throw(std::invalid_argument("'" + key + "' is incorrect GenerationConfig parameter name. " + "Use help(openvino_genai.GenerationConfig) to get list of acceptable parameters.")); + } + } + + return res_config; +} + } // namespace ov::genai::pybind::utils diff --git a/src/python/utils.hpp b/src/python/utils.hpp index 0a18a9c5f9..4047bdcfe7 100644 --- a/src/python/utils.hpp +++ b/src/python/utils.hpp @@ -6,6 +6,7 @@ #include #include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/llm_pipeline.hpp" namespace py = pybind11; using ov::genai::StreamerBase; @@ -33,4 +34,8 @@ std::map properties_to_any_map(const std::map Dict[str, List[str]]: file_path = TESTS_ROOT / 'data' / file_name with open(file_path, 'r') as f: - return {"questions": [s for s in f]} + return {"prompts": [s for s in f]} def get_scheduler_config(num_kv_blocks: int) -> SchedulerConfig: scheduler_config = SchedulerConfig() @@ -118,7 +118,7 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t data_dict = load_prompts_dataset(test_struct.prompt_file) - evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict, + evaluator = whowhatbench.TextEvaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict, generation_config=generation_config, generation_config_base=generation_config, max_new_tokens=test_struct.max_new_tokens, seqs_per_request=seqs_per_request)