From 1674f8065b8875ff5612f114adb500ed0a15307f Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 2 Sep 2024 08:28:09 +0400 Subject: [PATCH 01/13] update optimum-intel version (#812) --- llm_bench/python/benchmark.py | 3 ++- llm_bench/python/requirements.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index b11b66a214..6fc5e28b08 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -726,7 +726,8 @@ def get_argprser(): def main(): - log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout, encoding="utf-8") + logging_kwargs = {"encoding": "utf-8"} if sys.version_info[1] > 8 else {} + log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout, **logging_kwargs) args = get_argprser() model_path, framework, model_args, model_name = llm_bench_utils.model_utils.analyze_args(args) diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index ae31250edb..4671e44017 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -10,7 +10,7 @@ torch transformers>=4.40.0 diffusers>=0.22.0 #optimum is in dependency list of optimum-intel -git+https://github.com/huggingface/optimum-intel.git@e9800ced0f6ceaa7aa0afe67327bfe348815620d#egg=optimum-intel +git+https://github.com/huggingface/optimum-intel.git@b5998f2f44e581b102ed7a9b714ac0f7c2d51a66#egg=optimum-intel git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging psutil From fd9287cde2d69b094883bb838e9ca76a3144f80b Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 3 Sep 2024 08:49:38 +0400 Subject: [PATCH 02/13] update optimum-intel commit to include mxfp4 (#816) --- llm_bench/python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index 4671e44017..b944d0585a 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -10,7 +10,7 @@ torch transformers>=4.40.0 diffusers>=0.22.0 #optimum is in dependency list of optimum-intel -git+https://github.com/huggingface/optimum-intel.git@b5998f2f44e581b102ed7a9b714ac0f7c2d51a66#egg=optimum-intel +git+https://github.com/huggingface/optimum-intel.git@9a8782446e394ac07283b8bd8b44916c4f297826#egg=optimum-intel git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging psutil From f2545c24fbf9813ceb6d8e885f75209a21242960 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 4 Sep 2024 00:22:24 +0400 Subject: [PATCH 03/13] Use latest OpenVINO (#807) --- .github/workflows/causal_lm_cpp.yml | 74 ++++++++++--------- .github/workflows/lcm_dreamshaper_cpp.yml | 44 +++++------ .../workflows/stable_diffusion_1_5_cpp.yml | 48 ++++++------ CMakeLists.txt | 6 +- .../lcm_dreamshaper_v7/cpp/CMakeLists.txt | 2 +- .../stable_diffusion_1_5/cpp/CMakeLists.txt | 2 +- pyproject.toml | 4 +- thirdparty/openvino_tokenizers | 2 +- 8 files changed, 93 insertions(+), 89 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index f3424367ef..62de9fc671 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -13,9 +13,9 @@ concurrency: cancel-in-progress: true env: - l_ov_link: 
https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/l_openvino_toolkit_ubuntu20_2024.4.0.dev20240828_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/m_openvino_toolkit_macos_12_6_2024.4.0.dev20240828_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/w_openvino_toolkit_windows_2024.4.0.dev20240828_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu20_2024.5.0.dev20240830_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/m_openvino_toolkit_macos_12_6_2024.5.0.dev20240830_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/w_openvino_toolkit_windows_2024.5.0.dev20240830_x86_64.zip jobs: cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores @@ -34,11 +34,11 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - run: > . 
./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH timeout 25s @@ -77,11 +77,11 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh @@ -191,6 +191,7 @@ jobs: cpp-greedy_causal_lm-windows: runs-on: windows-latest + if: ${{ false }} # TODO: fix Windows env: PYTHONIOENCODING: "utf8" defaults: @@ -203,6 +204,8 @@ jobs: - uses: actions/setup-python@v4 with: python-version: 3.8 + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 - run: curl --output ov.zip ${{ env.w_ov_link }} - run: unzip -d ov ov.zip - run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" @@ -210,11 +213,11 @@ jobs: - name: Download, convert and build run: | call .\ov\setupvars.bat + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - run: > set PATH=.\build\openvino_genai\;%PATH% && call .\ov\setupvars.bat @@ -255,11 +258,11 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - run: > . 
./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH @@ -282,11 +285,11 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH @@ -310,11 +313,11 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j 15 - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH @@ -338,11 +341,11 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - run: > . 
./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH @@ -366,12 +369,12 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: run and compare run: | source ./ov/setupvars.sh @@ -403,12 +406,12 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat --task text-generation-with-past - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: run and compare run: | source ./ov/setupvars.sh @@ -470,11 +473,11 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j 15 - name: Run Generation run: | source ./ov/setupvars.sh @@ -518,16 +521,14 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && convert_tokenizer ./redpajama-3b-chat/ --output ./redpajama-3b-chat/ --with-detokenizer --trust-remote-code - name: Run Generation run: | source 
./ov/setupvars.sh - timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | @@ -569,11 +570,11 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh @@ -629,11 +630,11 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: Run gtests run: | source ./ov/setupvars.sh @@ -651,6 +652,7 @@ jobs: cpp-continuous-batching-windows: runs-on: windows-latest + if: ${{ false }} # TODO: fix Windows env: PYTHONIOENCODING: "utf8" defaults: @@ -663,6 +665,8 @@ jobs: - uses: actions/setup-python@v4 with: python-version: 3.8 + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 - name: Install OpenVINO run: | curl --output ov.zip ${{ env.w_ov_link }} @@ -672,11 +676,11 @@ jobs: - name: Install dependencies and build run: | call .\ov\setupvars.bat + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel --verbose python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: Run gtests run: | set PATH=.\build\openvino_genai\;%PATH% @@ -711,11 +715,11 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly 
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: Run gtests run: | source ./ov/setupvars.sh diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 7ef0360fa5..e76d04a073 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -16,8 +16,8 @@ permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b env: WORKING_DIRECTORY: "./image_generation/lcm_dreamshaper_v7/cpp/" PYTHON_VERSION: '3.8' - LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/l_openvino_toolkit_ubuntu20_2024.4.0.dev20240828_x86_64.tgz - WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/w_openvino_toolkit_windows_2024.4.0.dev20240828_x86_64.zip + LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu20_2024.5.0.dev20240830_x86_64.tgz + WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/w_openvino_toolkit_windows_2024.5.0.dev20240830_x86_64.zip OV_INSTALL_DIR: ${{ github.workspace }}/ov concurrency: @@ -41,36 +41,36 @@ jobs: mkdir ${{ env.OV_INSTALL_DIR }} tar -xzf openvino_package.tar.gz -C ${{ env.OV_INSTALL_DIR }} --strip-components=1 + - name: Build app + working-directory: ${{ env.WORKING_DIRECTORY }} + run: | + source ${{ env.OV_INSTALL_DIR }}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: python-version: ${{ env.PYTHON_VERSION }} cache: 'pip' - + - name: Create virtual environment working-directory: ${{ env.WORKING_DIRECTORY }} run: python3 -m venv openvino_lcm_cpp - + - name: Install python dependencies working-directory: ${{ env.WORKING_DIRECTORY }} run: | source openvino_lcm_cpp/bin/activate python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - + - name: Download and convert model and tokenizer working-directory: ${{ env.WORKING_DIRECTORY }} run: | source openvino_lcm_cpp/bin/activate optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 models/lcm_dreamshaper_v7/FP16 - - name: Build app - working-directory: ${{ env.WORKING_DIRECTORY }} - run: | - source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --parallel - - name: Run app working-directory: ${{ env.WORKING_DIRECTORY }} run: | @@ -96,16 +96,23 @@ jobs: mv ./tmp/*/* . popd + - name: Build app + working-directory: ${{ env.WORKING_DIRECTORY }} + run: | + . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: python-version: ${{ env.PYTHON_VERSION }} cache: 'pip' - + - name: Create virtual environment working-directory: ${{ env.WORKING_DIRECTORY }} run: python -m venv openvino_lcm_cpp - + - name: Install python dependencies working-directory: ${{ env.WORKING_DIRECTORY }} run: | @@ -118,14 +125,7 @@ jobs: run: | . "./openvino_lcm_cpp/Scripts/Activate.ps1" optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 models/lcm_dreamshaper_v7/FP16 - - - name: Build app - working-directory: ${{ env.WORKING_DIRECTORY }} - run: | - . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --parallel - + - name: Run app working-directory: ${{ env.WORKING_DIRECTORY }} run: | diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 18194c301c..ec440abb52 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -16,8 +16,8 @@ permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b env: WORKING_DIRECTORY: "./image_generation/stable_diffusion_1_5/cpp/" PYTHON_VERSION: '3.8' - LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/l_openvino_toolkit_ubuntu20_2024.4.0.dev20240828_x86_64.tgz - WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/w_openvino_toolkit_windows_2024.4.0.dev20240828_x86_64.zip + LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu20_2024.5.0.dev20240830_x86_64.tgz + WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/w_openvino_toolkit_windows_2024.5.0.dev20240830_x86_64.zip OV_INSTALL_DIR: ${{ github.workspace }}/ov concurrency: @@ -41,12 +41,19 @@ jobs: mkdir ${{ env.OV_INSTALL_DIR }} tar -xzf openvino_package.tar.gz -C ${{ env.OV_INSTALL_DIR }} --strip-components=1 + - name: Build app + working-directory: ${{ env.WORKING_DIRECTORY }} + run: | + source ${{ env.OV_INSTALL_DIR }}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: python-version: ${{ env.PYTHON_VERSION }} cache: 'pip' - + - name: Create virtual environment working-directory: ${{ env.WORKING_DIRECTORY }} run: python3 -m venv openvino_sd_cpp @@ -62,14 +69,7 @@ jobs: working-directory: ${{ env.WORKING_DIRECTORY }} run: | source openvino_sd_cpp/bin/activate - optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 - - - name: Build app - working-directory: ${{ env.WORKING_DIRECTORY }} - run: | - source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --parallel + optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 - name: Run app working-directory: ${{ env.WORKING_DIRECTORY }} @@ 
-95,37 +95,37 @@ jobs: Expand-Archive openvino_package.zip -DestinationPath ./tmp mv ./tmp/*/* . popd - + + - name: Build app + working-directory: ${{ env.WORKING_DIRECTORY }} + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: python-version: ${{ env.PYTHON_VERSION }} cache: 'pip' - + - name: Create virtual environment working-directory: ${{ env.WORKING_DIRECTORY }} run: python -m venv openvino_sd_cpp - + - name: Install python dependencies working-directory: ${{ env.WORKING_DIRECTORY }} run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - + - name: Download and convert model and tokenizer working-directory: ${{ env.WORKING_DIRECTORY }} run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" - optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 - - - name: Build app - working-directory: ${{ env.WORKING_DIRECTORY }} - run: | - . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --parallel - + optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 + - name: Run app working-directory: ${{ env.WORKING_DIRECTORY }} run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index a39ca7ed90..0f53505c20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,18 +26,18 @@ if(POLICY CMP0169) endif() project(OpenVINOGenAI - VERSION 2024.4.0.0 + VERSION 2024.5.0.0 DESCRIPTION "OpenVINO GenAI" HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai" LANGUAGES CXX) # Find OpenVINODeveloperPackage first to compile with SDL flags find_package(OpenVINODeveloperPackage ${OpenVINOGenAI_VERSION} QUIET - COMPONENTS Runtime Threading + COMPONENTS Runtime PATHS "${OpenVINO_DIR}") if(NOT OpenVINODeveloperPackage_FOUND) find_package(OpenVINO ${OpenVINOGenAI_VERSION} REQUIRED - COMPONENTS Runtime Threading) + COMPONENTS Runtime) endif() include(cmake/features.cmake) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/CMakeLists.txt b/image_generation/lcm_dreamshaper_v7/cpp/CMakeLists.txt index e3ab524859..7e7680a393 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/CMakeLists.txt +++ b/image_generation/lcm_dreamshaper_v7/cpp/CMakeLists.txt @@ -15,7 +15,7 @@ set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake build type") # dependencies -find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) +find_package(OpenVINO REQUIRED COMPONENTS Runtime) include(FetchContent) diff --git a/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt b/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt index 0e3f140e14..77466668a4 100644 --- a/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt +++ b/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt @@ -15,7 +15,7 @@ set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake build type") # dependencies -find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) +find_package(OpenVINO REQUIRED COMPONENTS Runtime) include(FetchContent) diff --git 
a/pyproject.toml b/pyproject.toml index 1ea9c9b85f..a1ac58a6a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openvino_genai" -version = "2024.4.0.0" +version = "2024.5.0.0" description = "Python bindings for https://github.com/openvinotoolkit/openvino.genai" requires-python = ">=3.8" readme = {file = "src/README.md", content-type="text/markdown"} @@ -16,7 +16,7 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] dependencies = [ - "openvino_tokenizers~=2024.4.0.0.dev" + "openvino_tokenizers~=2024.5.0.0.dev" ] [tool.py-build-cmake.module] diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 018a7b2013..8fed89faa2 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 018a7b2013bb61ad5c0f62e80209b78734bbba60 +Subproject commit 8fed89faa2381841caa9e67e282684448758f12e From 6a41498f4436cc0bdc3a77a9c6906c32177cf6e0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 4 Sep 2024 02:36:14 +0400 Subject: [PATCH 04/13] Bump diffusers from 0.30.1 to 0.30.2 (#815) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [diffusers](https://github.com/huggingface/diffusers) from 0.30.1 to 0.30.2.
Release notes
Sourced from diffusers' releases.
v0.30.2: Update from single file default repository
All commits
  • update runway repo for single_file by @yiyixuxu in #9323
  • Fix Flux CLIP prompt embeds repeat for num_images_per_prompt > 1 by @DN6 in #9280
  • [IP Adapter] Fix cache_dir and local_files_only for image encoder by @asomoza in #9272
Commits
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=diffusers&package-manager=pip&previous-version=0.30.1&new-version=0.30.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- image_generation/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image_generation/requirements.txt b/image_generation/requirements.txt index b53afc7b9a..bd5d3d677d 100644 --- a/image_generation/requirements.txt +++ b/image_generation/requirements.txt @@ -1,2 +1,2 @@ -r ../samples/requirements.txt -diffusers==0.30.1 +diffusers==0.30.2 From d30f62fd2f9cfd82c0258108496e155776a4ec51 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 4 Sep 2024 20:12:43 +0400 Subject: [PATCH 05/13] Fixed Android build (#809) Related PRs: - https://github.com/openvinotoolkit/openvino_tokenizers/pull/240 - https://github.com/jinja2cpp/Jinja2Cpp/pull/258 --- samples/cpp/beam_search_causal_lm/CMakeLists.txt | 8 +++++--- samples/cpp/benchmark_genai/CMakeLists.txt | 8 +++++--- samples/cpp/chat_sample/CMakeLists.txt | 8 +++++--- samples/cpp/greedy_causal_lm/CMakeLists.txt | 8 +++++--- samples/cpp/multinomial_causal_lm/CMakeLists.txt | 8 +++++--- samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt | 8 +++++--- .../prompt_lookup_decoding_lm.cpp | 2 +- samples/cpp/speculative_decoding_lm/CMakeLists.txt | 8 +++++--- .../speculative_decoding_lm/speculative_decoding_lm.cpp | 2 +- src/cpp/CMakeLists.txt | 3 +++ 10 files changed, 40 insertions(+), 23 deletions(-) diff --git a/samples/cpp/beam_search_causal_lm/CMakeLists.txt b/samples/cpp/beam_search_causal_lm/CMakeLists.txt index 9ea4730528..9bf1a8aac8 100644 --- a/samples/cpp/beam_search_causal_lm/CMakeLists.txt +++ b/samples/cpp/beam_search_causal_lm/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + HINTS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH ) add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) diff --git a/samples/cpp/benchmark_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt index 3a05c37d62..902a05eee6 100644 --- a/samples/cpp/benchmark_genai/CMakeLists.txt +++ b/samples/cpp/benchmark_genai/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH ) include(FetchContent) diff --git a/samples/cpp/chat_sample/CMakeLists.txt b/samples/cpp/chat_sample/CMakeLists.txt index 901f003d4c..69578dc86c 100644 --- a/samples/cpp/chat_sample/CMakeLists.txt +++ b/samples/cpp/chat_sample/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH ) add_executable(chat_sample chat_sample.cpp) diff --git a/samples/cpp/greedy_causal_lm/CMakeLists.txt b/samples/cpp/greedy_causal_lm/CMakeLists.txt index 409733bbc6..ff5151676f 100644 --- a/samples/cpp/greedy_causal_lm/CMakeLists.txt +++ b/samples/cpp/greedy_causal_lm/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH ) add_executable(greedy_causal_lm greedy_causal_lm.cpp) diff --git a/samples/cpp/multinomial_causal_lm/CMakeLists.txt b/samples/cpp/multinomial_causal_lm/CMakeLists.txt index 01b3bb3bb4..83b2335431 100644 --- a/samples/cpp/multinomial_causal_lm/CMakeLists.txt +++ b/samples/cpp/multinomial_causal_lm/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH ) add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index 9b7a15131d..c899c6e47b 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -3,9 +3,11 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH ) add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 3419f3221a..5e372a3f09 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -238,7 +238,7 @@ int main(int argc, char* argv[]) try { ov::Tensor position_ids = model.get_tensor("position_ids"); position_ids.set_shape(input_ids.get_shape()); std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - uint64_t seq_len = input_ids.get_shape()[1]; + size_t seq_len = input_ids.get_shape()[1]; // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index 1a9b02f1b2..078ac8bb52 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -3,9 +3,11 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH ) add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index de2d2f8837..f26cb6c7c4 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -272,7 +272,7 @@ int main(int argc, char* argv[]) try { ov::InferRequest draft_model = core.compile_model(ov_draft_model, "CPU").create_infer_request(); - uint64_t seq_len = input_ids.get_shape()[1]; + size_t seq_len = input_ids.get_shape()[1]; // main model (which is bigger, more accurate but slower) std::shared_ptr ov_main_model = core.read_model(std::string{argv[2]} + "/openvino_model.xml"); diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 56e19fbd9f..626c4a7903 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -31,6 +31,9 @@ function(ov_genai_build_jinja2cpp) set(JINJA2CPP_STRICT_WARNINGS OFF CACHE BOOL "") set(JINJA2CPP_PIC ON CACHE BOOL "") + # TMP WA: + set(RapidJSON_DIR "${CMAKE_BINARY_DIR}/_deps/rapidjson-build") + # options for Jinja2Cpp dependencies option(RAPIDJSON_BUILD_DOC "Build rapidjson documentation." OFF) From 0a54dc9db11d476ea008107189d88322ef61a452 Mon Sep 17 00:00:00 2001 From: Yaroslav Tarkan Date: Wed, 4 Sep 2024 21:49:54 +0300 Subject: [PATCH 06/13] [Port] Change Stable Diffusion v1.5 model (#825) This is a port of #824 to master branch [RunwayML](https://huggingface.co/runwayml) is no longer maintaining a HuggingFace organization so `runwayml/stable-diffusion-v1-5` model is not available for downloading. 
Replace it with a re-uploaded archive copy [`botp/stable-diffusion-v1-5`](https://huggingface.co/botp/stable-diffusion-v1-5) --- .github/workflows/stable_diffusion_1_5_cpp.yml | 4 ++-- image_generation/stable_diffusion_1_5/cpp/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index ec440abb52..970fbb4159 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -69,7 +69,7 @@ jobs: working-directory: ${{ env.WORKING_DIRECTORY }} run: | source openvino_sd_cpp/bin/activate - optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 + optimum-cli export openvino --model botp/stable-diffusion-v1-5 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 - name: Run app working-directory: ${{ env.WORKING_DIRECTORY }} @@ -124,7 +124,7 @@ jobs: working-directory: ${{ env.WORKING_DIRECTORY }} run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" - optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 + optimum-cli export openvino --model botp/stable-diffusion-v1-5 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 - name: Run app working-directory: ${{ env.WORKING_DIRECTORY }} diff --git a/image_generation/stable_diffusion_1_5/cpp/README.md b/image_generation/stable_diffusion_1_5/cpp/README.md index 57cf80d4fa..144f5a0552 100644 --- a/image_generation/stable_diffusion_1_5/cpp/README.md +++ b/image_generation/stable_diffusion_1_5/cpp/README.md @@ -57,7 +57,7 @@ The path to the OpenVINO install directory is referred as `` throug 2. Download the model from Huggingface and convert it to OpenVINO IR via [optimum-intel CLI](https://github.com/huggingface/optimum-intel). 
Example models to download: - - [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) + - [botp/stable-diffusion-v1-5](https://huggingface.co/botp/stable-diffusion-v1-5) - [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) Example command for downloading [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) model and exporting it with FP16 precision: From 179de15ae426e1d47446558f8599f277412d38cf Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Thu, 5 Sep 2024 17:22:36 +0400 Subject: [PATCH 07/13] use perf metrics genai in llm_bench (#818) Update the way how metrics are collected in the llm-bench CVS-151502 https://github.com/openvinotoolkit/openvino.genai/pull/830 --- llm_bench/python/benchmark.py | 30 ++++++++--------- llm_bench/python/llm_bench_utils/ov_utils.py | 35 ++------------------ 2 files changed, 15 insertions(+), 50 deletions(-) diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 6fc5e28b08..321441364d 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -224,11 +224,6 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) pt_inputs = tokenizer(input_text_list, return_tensors="pt") input_token_size = pt_inputs.input_ids.shape[1] - pipe_tokenizer = model.get_tokenizer() - tok_encode_start = time.perf_counter() - input_data = pipe_tokenizer.encode(input_text_list) - tok_encode_end = time.perf_counter() - tok_encode_time = (tok_encode_end - tok_encode_start) * 1000 if args['batch_size'] > 1: out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) out_str += " Batch_size={}, ".format(args['batch_size']) @@ -243,21 +238,19 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.start_collect_memory_consumption() max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] - streamer.reset() start = time.perf_counter() - generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).tokens + generation_result = model.generate(input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"]) end = time.perf_counter() - log.info(type(generated_tokens[0])) + generated_text = generation_result.texts + perf_metrics = generation_result.perf_metrics + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() mem_consumption.clear_max_memory_consumption() generation_time = end - start - tok_decode_start = time.perf_counter() - generated_text = pipe_tokenizer.decode(generated_tokens) - tok_decode_end = time.perf_counter() - tok_decode_time = (tok_decode_end - tok_decode_start) * 1000 + generated_tokens = [tokenizer(text).input_ids for text in generated_text] # Only text_gen need to minus length of input_data, because generated_text may include input_text num_tokens = 0 result_md5_list = [] @@ -275,9 +268,13 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data else: md5_list[num][prompt_index] = result_md5_list per_token_time 
= generation_time * 1000 / (num_tokens / args['batch_size']) - tm_list = streamer.get_time_list() + tm_list = np.array(perf_metrics.raw_metrics.m_durations) / 1000 / 1000 log.debug('latency of all tokens:') [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + tokenization_time = ( + np.mean(perf_metrics.raw_metrics.tokenization_durations) / 1000, + np.mean(perf_metrics.raw_metrics.detokenization_durations) / 1000 + ) iter_data = gen_iterate_data( num, input_token_size * args['batch_size'], @@ -290,19 +287,19 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, prompt_idx=prompt_index, - tokenization_time=(tok_encode_time, tok_decode_time) + tokenization_time=tokenization_time ) iter_data_list.append(iter_data) llm_bench_utils.metrics_print.print_metrics( num, iter_data, - tm_list, + tm_list.tolist(), [], warm_up=(num == 0), max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, - tokenization_time=(tok_encode_time, tok_decode_time), + tokenization_time=tokenization_time, batch_size=args['batch_size'] ) if num > 0: @@ -320,7 +317,6 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data assert (result_md5_list == prev_md5) else: llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) - streamer.reset() def run_text_generation_benchmark(model_path, framework, device, args, num_iters): diff --git a/llm_bench/python/llm_bench_utils/ov_utils.py b/llm_bench/python/llm_bench_utils/ov_utils.py index cd91991a53..1f9c6b6a31 100644 --- a/llm_bench/python/llm_bench_utils/ov_utils.py +++ b/llm_bench/python/llm_bench_utils/ov_utils.py @@ -144,9 +144,7 @@ def create_text_gen_model(model_path, device, **kwargs): raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') else: if kwargs.get("genai", False) and is_genai_available(log_msg=True): - if kwargs["batch_size"] > 1 or kwargs["num_beams"] > 1: - log.warning("OpenVINO GenAI based benchmarking implmented only for batch_size == 1 and num_beams == 1") - elif model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"]]: + if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"]]: log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. 
Will be switched to default bencmarking") else: return create_genai_text_gen_model(model_path, device, ov_config, **kwargs) @@ -183,34 +181,6 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): import openvino_genai from transformers import AutoTokenizer - class TokenStreamer(openvino_genai.StreamerBase): - def __init__(self, tokenizer): - openvino_genai.StreamerBase.__init__(self) - self.tokenizer = tokenizer - self.token_generation_time = [] - self.generated_tokens = [] - self.start_time = time.perf_counter() - - def put(self, token_id): - self.token_generation_time.append(time.perf_counter() - self.start_time) - self.generated_tokens.append(token_id) - self.start_time = time.perf_counter() - return False - - def reset(self): - self.token_generation_time = [] - self.generated_tokens = [] - self.start_time = time.perf_counter() - - def end(self): - pass - - def get_tokens(self): - return self.generated_tokens - - def get_time_list(self): - return self.token_generation_time - if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists(): convert_ov_tokenizer(model_path) @@ -220,9 +190,8 @@ def get_time_list(self): llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper(), ov_config) end = time.perf_counter() log.info(f'Pipeline initialization time: {end - start:.2f}s') - streamer = TokenStreamer(llm_pipe.get_tokenizer()) - return llm_pipe, tokenizer, end - start, streamer, True + return llm_pipe, tokenizer, end - start, None, True def convert_ov_tokenizer(tokenizer_path): From ecafec310fbdf92f0ef5fcef117dd981f0b207ab Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 5 Sep 2024 14:24:34 +0200 Subject: [PATCH 08/13] Fix perf metrics (#829) Port https://github.com/openvinotoolkit/openvino.genai/pull/823 to master CVS-151497 --------- Co-authored-by: Artur Paniukov --- .../include/openvino/genai/perf_metrics.hpp | 19 ++--- src/cpp/src/llm_pipeline.cpp | 4 +- src/cpp/src/perf_metrics.cpp | 9 +- src/python/openvino_genai/__init__.py | 16 ++-- src/python/py_generate_pipeline.cpp | 37 ++++---- tests/python_tests/test_generate_api.py | 84 +++++++++++++++++++ 6 files changed, 131 insertions(+), 38 deletions(-) diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index ad53d8d941..f256a123de 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -37,9 +37,6 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { std::vector m_new_token_times; std::vector m_batch_sizes; std::vector m_durations; - - size_t num_generated_tokens; - size_t num_input_tokens; }; /** @@ -111,15 +108,15 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { size_t num_input_tokens; float get_load_time(); // Load time in ms. - float get_num_generated_tokens(); - float get_num_input_tokens(); - MeanStdPair get_ttft(); // Time to the first token (in ms) (TTTFT). - MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT). - MeanStdPair get_throughput(); // Tokens per second. + size_t get_num_generated_tokens(); + size_t get_num_input_tokens(); + MeanStdPair get_ttft(); // Time to the first token (in ms) (TTFT). + MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT). + MeanStdPair get_throughput(); // Tokens per second. 
- MeanStdPair get_generate_duration(); - MeanStdPair get_tokenization_duration(); - MeanStdPair get_detokenization_duration(); + MeanStdPair get_generate_duration(); // in ms + MeanStdPair get_tokenization_duration(); // in ms + MeanStdPair get_detokenization_duration(); // in ms // Flag indicating if raw metrics were evaluated. // If false means current mean/std ttft, tpot, etc. are not actual diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index d89d63faa9..66e2890671 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -171,7 +171,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time)); raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time)); - + + // Added tokenization/detokenization times, and updated generate duration, need to reevaluate statistics. + decoded_results.perf_metrics.m_evaluated = false; decoded_results.perf_metrics.evaluate_statistics(start_time); return decoded_results; } diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index 2f378ab302..9ed0233232 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -10,6 +10,9 @@ namespace { ov::genai::MeanStdPair calc_mean_and_std(const std::vector& durations) { + if (durations.size() == 0) { + return {-1, -1}; + } // Accepts time durations in microseconds and returns standard deviation and mean in milliseconds. float mean = std::accumulate(durations.begin(), durations.end(), 0.0f, [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { @@ -36,14 +39,14 @@ float PerfMetrics::get_load_time() { return load_time; } -float PerfMetrics::get_num_generated_tokens() { +size_t PerfMetrics::get_num_generated_tokens() { evaluate_statistics(); return num_generated_tokens; } -float PerfMetrics::get_num_input_tokens() { +size_t PerfMetrics::get_num_input_tokens() { evaluate_statistics(); - return num_generated_tokens; + return num_input_tokens; } MeanStdPair PerfMetrics::get_ttft() { diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index 9e01068972..6c7bbf39ef 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -12,15 +12,17 @@ os.add_dll_directory(os.path.dirname(__file__)) from .py_generate_pipeline import ( - LLMPipeline, - Tokenizer, - GenerationConfig, - TokenizedInputs, + ContinuousBatchingPipeline, DecodedResults, EncodedResults, - StreamerBase, - StopCriteria, - ContinuousBatchingPipeline, + GenerationConfig, GenerationResult, + LLMPipeline, + PerfMetrics, + RawPerfMetrics, SchedulerConfig, + StopCriteria, + StreamerBase, + TokenizedInputs, + Tokenizer ) diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index aada9b0939..225257f0d9 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -247,22 +247,22 @@ auto perf_metrics_docstring = R"( :param get_num_input_tokens: Returns the number of tokens in the input prompt. :type get_num_input_tokens: int - :param get_ttft: Returns the mean and standard deviation of TTFT. + :param get_ttft: Returns the mean and standard deviation of TTFT in milliseconds. :type get_ttft: MeanStdPair - :param get_tpot: Returns the mean and standard deviation of TPOT. 
+ :param get_tpot: Returns the mean and standard deviation of TPOT in milliseconds. :type get_tpot: MeanStdPair - :param get_throughput: Returns the mean and standard deviation of throughput. + :param get_throughput: Returns the mean and standard deviation of throughput in tokens per second. :type get_throughput: MeanStdPair - :param get_generate_duration: Returns the mean and standard deviation of generate duration. + :param get_generate_duration: Returns the mean and standard deviation of generate durations in milliseconds. :type get_generate_duration: MeanStdPair - :param get_tokenization_duration: Returns the mean and standard deviation of tokenization duration. + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization durations in milliseconds. :type get_tokenization_duration: MeanStdPair - :param get_detokenization_duration: Returns the mean and standard deviation of detokenization duration. + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds. :type get_detokenization_duration: MeanStdPair :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. @@ -763,10 +763,12 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "RawPerfMetrics", raw_perf_metrics_docstring) .def(py::init<>()) - .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) + .def_property_readonly("generate_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::generate_durations); + }) .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { return get_ms(rw, &RawPerfMetrics::tokenization_durations); - }) + }) .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { return get_ms(rw, &RawPerfMetrics::detokenization_durations); }) @@ -776,24 +778,27 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_property_readonly("m_durations", [](const RawPerfMetrics &rw) { return get_ms(rw, &RawPerfMetrics::m_durations); }) - .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes) - .def_readonly("num_generated_tokens", &RawPerfMetrics::num_generated_tokens) - .def_readonly("num_input_tokens", &RawPerfMetrics::num_input_tokens); + .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes); py::class_(m, "MeanStdPair") .def(py::init<>()) .def_readonly("mean", &MeanStdPair::mean) - .def_readonly("std", &MeanStdPair::std); + .def_readonly("std", &MeanStdPair::std) + .def("__iter__", [](const MeanStdPair &self) { + return py::make_iterator(&self.mean, &self.std + 1); + }, py::keep_alive<0, 1>()); // Keep object alive while the iterator is used; py::class_(m, "PerfMetrics", perf_metrics_docstring) .def(py::init<>()) + .def("get_load_time", &PerfMetrics::get_load_time) + .def("get_num_generated_tokens", &PerfMetrics::get_num_generated_tokens) + .def("get_num_input_tokens", &PerfMetrics::get_num_input_tokens) + .def("get_ttft", &PerfMetrics::get_ttft) + .def("get_tpot", &PerfMetrics::get_tpot) + .def("get_throughput", &PerfMetrics::get_throughput) .def("get_generate_duration", &PerfMetrics::get_generate_duration) .def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration) .def("get_detokenization_duration", &PerfMetrics::get_detokenization_duration) - .def("get_throughput", &PerfMetrics::get_throughput) - .def("get_tpot", &PerfMetrics::get_tpot) - .def("get_ttft", &PerfMetrics::get_ttft) - .def("get_load_time", &PerfMetrics::get_load_time) .def("__add__", &PerfMetrics::operator+) 
.def("__iadd__", &PerfMetrics::operator+=) .def_readonly("raw_metrics", &PerfMetrics::raw_metrics); diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 9a02f506bb..f80729d425 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -723,3 +723,87 @@ def test_cb_streamer_vs_return_vs_stateful(prompt): reference = stateful.generate(prompt, max_new_tokens=20) assert generated == "".join(streamed) assert "".join(streamed) == reference + +def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics: + model_id, path, tokenizer, model, pipe = model_descr + + config = generation_config.copy() # to avoid side effects + + if 'do_sample' not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set explicitly to False, but only if test arguments omitted this arg. + # Do not apply 'repetition_penalty' if sampling is not used. + config['do_sample'] = False + config['repetition_penalty'] = None + return pipe.generate([prompt], **config).perf_metrics + + +test_cases = [ + (dict(max_new_tokens=20), 'table is made of'), +] +@pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_perf_metrics(model_descr, generation_config, prompt): + import time + start_time = time.perf_counter() + perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt) + total_time = (time.perf_counter() - start_time) * 1000 + + # Check that load time is adequate. + load_time = perf_metrics.get_load_time() + assert load_time > 0 and load_time < 1000.0 + + # Check that num input and generated tokens are adequate. 
+    num_generated_tokens = perf_metrics.get_num_generated_tokens()
+    assert num_generated_tokens > 0 and num_generated_tokens <= generation_config['max_new_tokens']
+
+    num_input_tokens = perf_metrics.get_num_input_tokens()
+    assert num_input_tokens > 0 and num_input_tokens <= len(prompt)
+
+    mean_ttft, std_ttft = perf_metrics.get_ttft()
+    assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std)
+    assert mean_ttft > 0 and mean_ttft < 1000.0
+
+    mean_tpot, std_tpot = perf_metrics.get_tpot()
+    assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std)
+    assert mean_tpot > 0 and mean_tpot < 1000.0
+
+    mean_throughput, std_throughput = perf_metrics.get_throughput()
+    assert (mean_throughput, std_throughput) == (perf_metrics.get_throughput().mean, perf_metrics.get_throughput().std)
+    assert mean_throughput > 0 and mean_throughput < 20000.0
+
+    mean_gen_duration, std_gen_duration = perf_metrics.get_generate_duration()
+    assert (mean_gen_duration, std_gen_duration) == (perf_metrics.get_generate_duration().mean, perf_metrics.get_generate_duration().std)
+    assert mean_gen_duration > 0 and load_time + mean_gen_duration < total_time
+    assert std_gen_duration == 0
+
+    mean_tok_duration, std_tok_duration = perf_metrics.get_tokenization_duration()
+    assert (mean_tok_duration, std_tok_duration) == (perf_metrics.get_tokenization_duration().mean, perf_metrics.get_tokenization_duration().std)
+    assert mean_tok_duration > 0 and mean_tok_duration < mean_gen_duration
+    assert std_tok_duration == 0
+
+    mean_detok_duration, std_detok_duration = perf_metrics.get_detokenization_duration()
+    assert (mean_detok_duration, std_detok_duration) == (perf_metrics.get_detokenization_duration().mean, perf_metrics.get_detokenization_duration().std)
+    assert mean_detok_duration > 0 and mean_detok_duration < mean_gen_duration
+    assert std_detok_duration == 0
+
+    # Assert that statistics calculated manually from the raw counters match the results from PerfMetrics.
+    raw_metrics = perf_metrics.raw_metrics
+    raw_dur = np.array(raw_metrics.generate_durations) / 1000
+    assert np.allclose(mean_gen_duration, np.mean(raw_dur))
+    assert np.allclose(std_gen_duration, np.std(raw_dur))
+
+    raw_dur = np.array(raw_metrics.tokenization_durations) / 1000
+    assert np.allclose(mean_tok_duration, np.mean(raw_dur))
+    assert np.allclose(std_tok_duration, np.std(raw_dur))
+
+    raw_dur = np.array(raw_metrics.detokenization_durations) / 1000
+    assert np.allclose(mean_detok_duration, np.mean(raw_dur))
+    assert np.allclose(std_detok_duration, np.std(raw_dur))
+
+    assert len(raw_metrics.m_times_to_first_token) > 0
+    assert len(raw_metrics.m_batch_sizes) > 0
+    assert len(raw_metrics.m_durations) > 0

From 72730a436bdde885a284c511f2a99839d539ef9b Mon Sep 17 00:00:00 2001
From: lamiayous <124199862+lamiayous@users.noreply.github.com>
Date: Fri, 6 Sep 2024 04:53:07 +0100
Subject: [PATCH 09/13] changing dimensions of batch size, kv cache and
 num_input_heads (#793)

Once KV-cache tensors are exposed from the stateful model, they should be
reshaped to have a static size. The current implementation of the reshape
function assumes that the KV-cache dimension is always 2 and the batch
dimension is always 0. For chatglm and Qwen this is not the case.
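In outline, the fix reads the model_type from the exported model's
config.json and maps it to the correct batch and sequence-length axis
positions. A minimal Python sketch of that mapping (the function name and
dict layout are illustrative; the actual C++ helper, get_kv_axes(), is in
the diff below):

import json

def kv_axes_for(config_path: str) -> dict:
    # Read the architecture name from the exported model's config.json.
    with open(config_path) as f:
        model_type = json.load(f)["model_type"]
    if model_type == "chatglm":
        return {"batch": 1, "seq_len": 0}
    if model_type == "qwen":
        # qwen2 does not fall into this category and keeps the default layout.
        return {"batch": 0, "seq_len": 1}
    # Default KV-cache layout: batch at dim 0, sequence length at dim 2.
    return {"batch": 0, "seq_len": 2}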
This PR identifies the KV-cache and batch dimensions by reading the model's
config.json file.

---------

Co-authored-by: Zlobin Vladimir
Co-authored-by: Ilya Lavrenov
---
 src/cpp/src/llm_pipeline_static.cpp | 49 +++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 71a76bc922..c7039a5901 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -9,6 +9,8 @@
 #include "utils.hpp"

 #include
+#include <fstream>
+#include <nlohmann/json.hpp>

 namespace {
@@ -83,9 +85,39 @@ std::shared_ptr<ov::Model> add_slices_to_kvcache_inputs(const std::shared_ptr<ov::Model>& model) {
     return std::make_shared<ov::Model>(model->get_results(), ov::SinkVector{}, new_params);
 }

+struct KVAxesPosition {
+    uint32_t batch;
+    uint32_t seq_len;
+};
+
+KVAxesPosition get_kv_axes(const std::string& model_type) {
+    KVAxesPosition axes;
+    if (model_type == "chatglm") {
+        axes.batch = 1u;
+        axes.seq_len = 0u;
+    } else if (model_type == "qwen") {
+        // Note, qwen2 does not fall into this category and conforms to default layout
+        axes.batch = 0u;
+        axes.seq_len = 1u;
+    } else {
+        axes.batch = 0u;
+        axes.seq_len = 2u;
+    }
+    return axes;
+}
+
+std::string get_model_type_from_json(const std::filesystem::path& filepath) {
+    std::ifstream file(filepath);
+    OPENVINO_ASSERT(file.is_open(), "Could not open file: " + filepath.string());
+    nlohmann::json config_data = nlohmann::json::parse(file);
+    std::string model_type = config_data["model_type"].get<std::string>();
+    return model_type;
+}
+
 void reshape_to_static(std::shared_ptr<ov::Model> model,
                        const uint32_t input_size,
-                       const uint32_t kvcache_size) {
+                       const uint32_t kvcache_size,
+                       const KVAxesPosition& kv_axes_position) {
     std::map<std::string, ov::PartialShape> new_shapes;
     for (auto input : model->inputs()) {
         const auto& input_name = input.get_any_name();
@@ -98,10 +130,9 @@ void reshape_to_static(std::shared_ptr<ov::Model> model,
             new_shape = ov::PartialShape({1, input_size});
         } else {
             const auto& partial_shape = input.get_partial_shape();
-            new_shape = ov::PartialShape({1,
-                                          partial_shape[1].get_length(),
-                                          kvcache_size-input_size,
-                                          partial_shape[3].get_length()});
+            new_shape = partial_shape;
+            new_shape[kv_axes_position.batch] = 1;
+            new_shape[kv_axes_position.seq_len] = kvcache_size - input_size;
         }
         new_shapes.emplace(input_name, new_shape);
     }
@@ -222,10 +253,10 @@ StaticLLMPipeline::StaticLLMPipeline(
     // (6) Reshape both models to static shape
     const auto kMaxPromptLen = pop_or_default(pipeline_config, "MAX_PROMPT_LEN", 1024u);
     const auto kMinResponseLen = pop_or_default(pipeline_config, "MIN_RESPONSE_LEN", 150u);
-    // FIXME For some models KV-cache dim != 2u
-    m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, 2u };
-    reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size);
-    reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size);
+    KVAxesPosition axes = get_kv_axes(get_model_type_from_json(path / "config.json"));
+    m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len };
+    reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
+    reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes);
     // (7) Compile both model
     auto prefill_config = pop_or_default(pipeline_config, "PREFILL_CONFIG", get_default_prefill_config());
     auto generate_config = pop_or_default(pipeline_config, "GENERATE_CONFIG", get_default_generate_config());

From e0a0c85b8ab20c90f8daea1fde07f16e7660bead Mon Sep 17 00:00:00 2001
From: guweixin
Date: Fri, 6 Sep 2024 15:55:30 +0800
Subject: [PATCH 10/13] Add perf metrics for Static LLM Pipeline (#808)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I found that openvino_genai.PerfMetrics is not available for NPU, so it is
not possible to accurately measure LLM performance on NPU. I added the code
needed for this. (Before/after screenshots omitted.)

---
 src/cpp/src/llm_pipeline_static.cpp | 57 +++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index c7039a5901..002323dd46 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -307,28 +307,46 @@ DecodedResults StaticLLMPipeline::generate(
     OptionalGenerationConfig generation_config,
     StreamerVariant streamer
 ) {
+    auto start_time = std::chrono::steady_clock::now();
     GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
-    if (std::holds_alternative<std::vector<std::string>>(inputs)) {
-        OPENVINO_THROW("Currently only batch size=1 is supported");
-    }
-
-    OPENVINO_ASSERT(std::holds_alternative<std::string>(inputs));
-    auto& prompt = std::get<std::string>(inputs);
-
-    if (m_is_chat_conversation) {
-        m_history.push_back({{"role", "user"}, {"content", prompt}});
-        constexpr bool add_generation_prompt = true;
-        prompt = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+    TokenizedInputs tokenized_input;
+    if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
+        // OPENVINO_ASSERT(!m_is_chat_conversation, "Can't chat with multiple prompts");
+        auto& strings = std::get<std::vector<std::string>>(inputs);
+        if (strings.size() != 1) {
+            OPENVINO_THROW("Currently only batch size=1 is supported");
+        } else {
+            tokenized_input = m_tokenizer.encode(*input_vector);
+        }
+    } else if (auto input_prompt = std::get_if<std::string>(&inputs)) {
+        std::string& prompt = *input_prompt;
+        if (m_is_chat_conversation) {
+            m_history.push_back({{"role", "user"}, {"content", prompt}});
+            constexpr bool add_generation_prompt = true;
+            prompt = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+        }
+        tokenized_input = m_tokenizer.encode(prompt);
     }
-    auto tokenized_input = m_tokenizer.encode(prompt);
+    auto encode_stop_time = std::chrono::steady_clock::now();
     auto encoded_results = generate(tokenized_input, config, streamer);
+    auto decode_start_time = std::chrono::steady_clock::now();
     DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores};
+    auto decode_stop_time = std::chrono::steady_clock::now();

     if (m_is_chat_conversation) {
         auto answer = decoded_results.texts[0];
         m_history.push_back({{"role", "assistant"}, {"content", answer}});
     }
+    // generate_durations
+    decoded_results.perf_metrics = encoded_results.perf_metrics;
+    auto& raw_counters = decoded_results.perf_metrics.raw_metrics;
+    auto stop_time = std::chrono::steady_clock::now();
+    raw_counters.generate_durations = std::vector<MicroSeconds>();
+    raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+    raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time));
+    raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time));
+    decoded_results.perf_metrics.evaluate_statistics(start_time);
     return decoded_results;
 }

@@ -337,6 +355,7 @@ EncodedResults StaticLLMPipeline::generate(
     OptionalGenerationConfig generation_config,
     StreamerVariant streamer
 ) {
+    auto start_time = std::chrono::steady_clock::now();
     ov::Tensor input_ids;
     ov::Tensor attention_mask;

@@ -371,7 +390,10 @@ EncodedResults StaticLLMPipeline::generate(
         OPENVINO_THROW("Currently only greedy decoding is supported");
     }

+    ov::Shape prompts_shape = input_ids.get_shape();
+    const size_t batch_size = prompts_shape[0];
     ov::genai::EncodedResults results;
+    auto& raw_perf_counters = results.perf_metrics.raw_metrics;
     // NB: Only batch=1 is supported now
     results.scores.resize(1u);
     results.scores[0] = 0u;
@@ -401,6 +423,8 @@ EncodedResults StaticLLMPipeline::generate(
     std::iota(padded_pos_data + offset, padded_pos_data + padded_position_ids.get_size(), 0u);

     m_prefill_request.infer();
+    raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
+    raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

     // NB: Now there are prompt_len tokens in KV-cache
     m_kvcache_desc.num_stored_tokens += prompt_len;
@@ -454,6 +478,8 @@ EncodedResults StaticLLMPipeline::generate(

         last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0);
         results.tokens[0].push_back(last_token);
+        raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
+        raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

         if (streamer_ptr && streamer_ptr->put(last_token)) {
             break;
         }
@@ -478,6 +504,13 @@ EncodedResults StaticLLMPipeline::generate(
             m_kvcache_request.get_tensor(output_name).copy_to(kvcache_in_slice);
         }
     }
+    auto stop_time = std::chrono::steady_clock::now();
+    // If generate() is called directly with tokenized inputs, tokenization/detokenization stats are not reported.
+    auto& metrics = results.perf_metrics;
+    metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
+    metrics.load_time = this->m_load_time_ms;
+    metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+    metrics.evaluate_statistics(start_time);
     return results;
 }

From 2eb9cbd83770ae85e264282321a0b327509bfcf0 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Wed, 4 Sep 2024 20:49:01 +0200
Subject: [PATCH 11/13] Updated GenAI

---
 .github/workflows/causal_lm_cpp.yml | 1 -
 src/cpp/CMakeLists.txt              | 7 ++-----
 thirdparty/openvino_tokenizers      | 2 +-
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 62de9fc671..319bd3384b 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -554,7 +554,6 @@ jobs:

   cpp-chat_sample-ubuntu:
     runs-on: ubuntu-20.04
-    if: false # Skip temporarily until https://github.com/openvinotoolkit/openvino_tokenizers/pull/235 is merged.
steps: - uses: actions/checkout@v4 with: diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 626c4a7903..7bdc7cd863 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -15,8 +15,8 @@ endif() function(ov_genai_build_jinja2cpp) FetchContent_Declare(jinja2cpp - URL https://github.com/jinja2cpp/Jinja2Cpp/archive/bcc0e30f17f17f738ec2a7a31316d6efbe78a0e0.tar.gz - URL_HASH SHA256=f76547deb323240e7d181ecda6f658757ea5eb07ce772cf39e8bd20467412164) + URL https://github.com/jinja2cpp/Jinja2Cpp/archive/b32fbde7d98d13c34784c332c4a24a6f92c76e38.tar.gz + URL_HASH SHA256=7cc25ddbc438a5c874d404e100b4eccd8a331c195417f5487c48aebcf4b9e7fb) FetchContent_GetProperties(jinja2cpp) if(NOT jinja2cpp_POPULATED) @@ -31,9 +31,6 @@ function(ov_genai_build_jinja2cpp) set(JINJA2CPP_STRICT_WARNINGS OFF CACHE BOOL "") set(JINJA2CPP_PIC ON CACHE BOOL "") - # TMP WA: - set(RapidJSON_DIR "${CMAKE_BINARY_DIR}/_deps/rapidjson-build") - # options for Jinja2Cpp dependencies option(RAPIDJSON_BUILD_DOC "Build rapidjson documentation." OFF) diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 8fed89faa2..cb79143c51 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 8fed89faa2381841caa9e67e282684448758f12e +Subproject commit cb79143c51b426915c23b088185cf4e9bbf3ff8c From fe7bfd0bbb05ca50c16ceb05ded78f5ed95853a6 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 5 Sep 2024 14:57:25 +0200 Subject: [PATCH 12/13] Use tarballs --- .github/workflows/causal_lm_cpp.yml | 117 ++++++++++++++++++++++++---- CMakeLists.txt | 4 + src/cpp/src/tokenizers_path.cpp | 11 ++- thirdparty/openvino_tokenizers | 2 +- 4 files changed, 114 insertions(+), 20 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 319bd3384b..b19dbbe06d 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -19,6 +19,9 @@ env: jobs: cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -31,11 +34,14 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app run: | source ./ov/setupvars.sh cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 @@ -62,6 +68,10 @@ jobs: python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py, ] runs-on: ubuntu-20.04 + if: ${{ false }} # fails because of UNICODE output + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -74,11 +84,14 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app run: | source ./ov/setupvars.sh cmake 
-DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
           python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2
      - run: > .
@@ -62,6 +68,10 @@ jobs:
            python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py,
          ]
    runs-on: ubuntu-20.04
+    if: ${{ false }} # fails because of UNICODE output
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
@@ -210,11 +223,14 @@ jobs:
      - run: unzip -d ov ov.zip
      - run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
        shell: bash
-      - name: Download, convert and build
+      - name: Build app
        run: |
          call .\ov\setupvars.bat
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          call .\ov\setupvars.bat
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
@@ -243,6 +259,9 @@ jobs:

  cpp-beam_search_causal_lm-Qwen-7B-Chat:
    runs-on: ubuntu-20.04-16-cores
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat
@@ -270,6 +292,9 @@ jobs:

  cpp-beam_search_causal_lm-Qwen1_5-7B-Chat:
    runs-on: ubuntu-20.04-16-cores
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat
@@ -298,6 +326,9 @@ jobs:

  cpp-beam_search_causal_lm-Phi-2:
    runs-on: ubuntu-20.04-16-cores
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2
@@ -326,6 +360,9 @@ jobs:

  cpp-beam_search_causal_lm-notus-7b-v1:
    runs-on: ubuntu-20.04-16-cores
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1
@@ -354,6 +394,9 @@ jobs:

  cpp-speculative_decoding_lm-ubuntu:
    runs-on: ubuntu-20.04-16-cores
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b
@@ -391,6 +437,9 @@ jobs:

  cpp-prompt_lookup_decoding_lm-ubuntu:
    runs-on: ubuntu-20.04-16-cores
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
@@ -458,6 +510,9 @@ jobs:

  cpp-Phi-1_5:
    runs-on: ubuntu-20.04-16-cores
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5
@@ -506,6 +564,9 @@ jobs:

  cpp-greedy_causal_lm-redpajama-3b-chat:
    runs-on: ubuntu-20.04-4-cores
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat
@@ -554,6 +618,9 @@ jobs:

  cpp-chat_sample-ubuntu:
    runs-on: ubuntu-20.04
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
@@ -614,6 +684,9 @@ jobs:

  cpp-continuous-batching-ubuntu:
    runs-on: ubuntu-20.04-8-cores
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
@@ -672,11 +748,14 @@ jobs:
          unzip -d ov ov.zip
          dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
        shell: bash
-      - name: Install dependencies and build
+      - name: Build app
        run: |
          call .\ov\setupvars.bat
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
-          cmake --build ./build/ --config Release --parallel --verbose
+          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          call .\ov\setupvars.bat
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
@@ -699,6 +778,9 @@ jobs:

  cpp-continuous-batching-macos:
    runs-on: macos-12
+    defaults:
+      run:
+        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          brew install coreutils scons
-      - name: Download, convert and build
+      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0f53505c20..e6458d27b4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,10 @@ if(ENABLE_PYTHON)
    endif()
 endif()

+if(WIN32 OR APPLE)
+    set(CMAKE_DEBUG_POSTFIX "d")
+endif()
+
 add_subdirectory(thirdparty)
 add_subdirectory(src)
 add_subdirectory(samples)
diff --git a/src/cpp/src/tokenizers_path.cpp b/src/cpp/src/tokenizers_path.cpp
index e247a65a4c..d0cd82c265 100644
--- a/src/cpp/src/tokenizers_path.cpp
+++ b/src/cpp/src/tokenizers_path.cpp
@@ -57,12 +57,17 @@ std::string
get_ov_genai_library_path() { } std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { +#if !defined(NDEBUG) && (defined(__APPLE__) || defined(_WIN32)) +# define LIB_POSTFIX "d" +#else +# define LIB_POSTFIX "" +#endif #ifdef _WIN32 - constexpr char tokenizers[] = "openvino_tokenizers.dll"; + constexpr char tokenizers[] = "openvino_tokenizers" LIB_POSTFIX ".dll"; #elif defined(__linux__) - constexpr char tokenizers[] = "libopenvino_tokenizers.so"; + constexpr char tokenizers[] = "libopenvino_tokenizers" LIB_POSTFIX ".so"; #elif defined(__APPLE__) - constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; + constexpr char tokenizers[] = "libopenvino_tokenizers" LIB_POSTFIX ".dylib"; #else # error "Unsupported OS" #endif diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index cb79143c51..fcfa71edb4 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit cb79143c51b426915c23b088185cf4e9bbf3ff8c +Subproject commit fcfa71edb4f31cbfc349944a64ea333db4a784fa From 0a53c9196f36b1acff1d3564abf5a69f386366b8 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 6 Sep 2024 04:33:24 +0200 Subject: [PATCH 13/13] Updated tokenizers --- thirdparty/openvino_tokenizers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index fcfa71edb4..b6c36a3026 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit fcfa71edb4f31cbfc349944a64ea333db4a784fa +Subproject commit b6c36a302696329f008e4425c9d98c4e00194a24
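For reference, a minimal usage sketch of the Python perf-metrics API exercised
by this series (the model directory and prompt below are illustrative
assumptions, not part of the patches):

import openvino_genai as ov_genai

# Assumes a model already exported with optimum-cli, as in the workflows above.
pipe = ov_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")
metrics = pipe.generate(["The table is made of"], max_new_tokens=20).perf_metrics

mean_ttft, std_ttft = metrics.get_ttft()   # MeanStdPair unpacks to (mean, std), in ms
mean_tpot, std_tpot = metrics.get_tpot()   # per-token time, in ms
print(f"load time:  {metrics.get_load_time():.1f} ms")
print(f"ttft:       {mean_ttft:.1f} +/- {std_ttft:.1f} ms")
print(f"tpot:       {mean_tpot:.1f} ms")
print(f"throughput: {metrics.get_throughput().mean:.1f} tokens/s")
raw = metrics.raw_metrics                  # raw per-call counters behind the statistics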