diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index f3424367ef..b19dbbe06d 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -13,12 +13,15 @@ concurrency: cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/l_openvino_toolkit_ubuntu20_2024.4.0.dev20240828_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/m_openvino_toolkit_macos_12_6_2024.4.0.dev20240828_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/w_openvino_toolkit_windows_2024.4.0.dev20240828_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu20_2024.5.0.dev20240830_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/m_openvino_toolkit_macos_12_6_2024.5.0.dev20240830_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/w_openvino_toolkit_windows_2024.5.0.dev20240830_x86_64.zip jobs: cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -31,14 +34,17 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre 
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - run: > . ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH timeout 25s @@ -62,6 +68,10 @@ jobs: python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py, ] runs-on: ubuntu-20.04 + if: ${{ false }} # fails because of UNICODE output + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -74,14 +84,17 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh @@ -191,6 +204,7 @@ jobs: cpp-greedy_causal_lm-windows: runs-on: windows-latest + if: ${{ false }} # TODO: fix Windows env: PYTHONIOENCODING: "utf8" defaults: @@ -203,18 +217,23 @@ jobs: 
- uses: actions/setup-python@v4 with: python-version: 3.8 + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 - run: curl --output ov.zip ${{ env.w_ov_link }} - run: unzip -d ov ov.zip - run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" shell: bash - - name: Download, convert and build + - name: Build app + run: | + call .\ov\setupvars.bat + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert model run: | call .\ov\setupvars.bat python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - run: > set PATH=.\build\openvino_genai\;%PATH% && call .\ov\setupvars.bat @@ -240,6 +259,9 @@ jobs: cpp-beam_search_causal_lm-Qwen-7B-Chat: runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -252,14 +274,17 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH @@ -267,6 +292,9 @@ jobs: cpp-beam_search_causal_lm-Qwen1_5-7B-Chat: runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -279,14 +307,17 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - run: > . 
./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH @@ -295,6 +326,9 @@ jobs: cpp-beam_search_causal_lm-Phi-2: runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -307,14 +341,17 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j 15 - run: > . 
./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH @@ -323,6 +360,9 @@ jobs: cpp-beam_search_causal_lm-notus-7b-v1: runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -335,14 +375,17 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - run: > . 
./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH @@ -351,6 +394,9 @@ jobs: cpp-speculative_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -363,15 +409,18 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: run and compare run: | source ./ov/setupvars.sh @@ -388,6 +437,9 @@ jobs: cpp-prompt_lookup_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -400,15 +452,18 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install 
--upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat --task text-generation-with-past - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: run and compare run: | source ./ov/setupvars.sh @@ -455,6 +510,9 @@ jobs: cpp-Phi-1_5: runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -467,14 +525,17 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j 15 - name: Run Generation run: | source ./ov/setupvars.sh @@ -503,6 +564,9 @@ jobs: cpp-greedy_causal_lm-redpajama-3b-chat: runs-on: ubuntu-20.04-4-cores + defaults: + run: + 
shell: bash steps: - uses: actions/checkout@v4 with: @@ -515,19 +579,20 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && convert_tokenizer ./redpajama-3b-chat/ --output ./redpajama-3b-chat/ --with-detokenizer --trust-remote-code - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | @@ -553,7 +618,9 @@ jobs: cpp-chat_sample-ubuntu: runs-on: ubuntu-20.04 - if: false # Skip temporarily until https://github.com/openvinotoolkit/openvino_tokenizers/pull/235 is merged. 
+ defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -566,14 +633,17 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh @@ -614,6 +684,9 @@ jobs: cpp-continuous-batching-ubuntu: runs-on: ubuntu-20.04-8-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -626,14 +699,17 @@ jobs: mkdir ./ov/ curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: Run gtests run: | source ./ov/setupvars.sh @@ -651,6 +727,7 @@ jobs: cpp-continuous-batching-windows: runs-on: windows-latest + if: ${{ false }} # TODO: fix Windows env: PYTHONIOENCODING: "utf8" defaults: @@ -663,20 +740,25 @@ jobs: - uses: actions/setup-python@v4 with: python-version: 3.8 + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 - name: Install OpenVINO run: | curl --output ov.zip ${{ env.w_ov_link }} unzip -d ov ov.zip dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" shell: bash - - name: Install dependencies and build + - name: Build app + run: | + call .\ov\setupvars.bat + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | call .\ov\setupvars.bat python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: Run gtests run: | set PATH=.\build\openvino_genai\;%PATH% @@ -696,6 +778,9 @@ jobs: cpp-continuous-batching-macos: runs-on: macos-12 + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: @@ -708,14 +793,17 @@ jobs: mkdir ./ov/ curl ${{ env.m_ov_link }} | tar 
--directory ./ov/ --strip-components 1 -xz brew install coreutils scons - - name: Download, convert and build + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release -j - name: Run gtests run: | source ./ov/setupvars.sh diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 7ef0360fa5..e76d04a073 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -16,8 +16,8 @@ permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b env: WORKING_DIRECTORY: "./image_generation/lcm_dreamshaper_v7/cpp/" PYTHON_VERSION: '3.8' - LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/l_openvino_toolkit_ubuntu20_2024.4.0.dev20240828_x86_64.tgz - WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/w_openvino_toolkit_windows_2024.4.0.dev20240828_x86_64.zip + LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu20_2024.5.0.dev20240830_x86_64.tgz + WINDOWS_OV_ARCHIVE_URL: 
https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/w_openvino_toolkit_windows_2024.5.0.dev20240830_x86_64.zip OV_INSTALL_DIR: ${{ github.workspace }}/ov concurrency: @@ -41,36 +41,36 @@ jobs: mkdir ${{ env.OV_INSTALL_DIR }} tar -xzf openvino_package.tar.gz -C ${{ env.OV_INSTALL_DIR }} --strip-components=1 + - name: Build app + working-directory: ${{ env.WORKING_DIRECTORY }} + run: | + source ${{ env.OV_INSTALL_DIR }}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: python-version: ${{ env.PYTHON_VERSION }} cache: 'pip' - + - name: Create virtual environment working-directory: ${{ env.WORKING_DIRECTORY }} run: python3 -m venv openvino_lcm_cpp - + - name: Install python dependencies working-directory: ${{ env.WORKING_DIRECTORY }} run: | source openvino_lcm_cpp/bin/activate python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - + - name: Download and convert model and tokenizer working-directory: ${{ env.WORKING_DIRECTORY }} run: | source openvino_lcm_cpp/bin/activate optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 models/lcm_dreamshaper_v7/FP16 - - name: Build app - working-directory: ${{ env.WORKING_DIRECTORY }} - run: | - source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --parallel - - name: Run app working-directory: ${{ env.WORKING_DIRECTORY }} run: | @@ -96,16 +96,23 @@ jobs: mv ./tmp/*/* . popd + - name: Build app + working-directory: ${{ env.WORKING_DIRECTORY }} + run: | + . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: python-version: ${{ env.PYTHON_VERSION }} cache: 'pip' - + - name: Create virtual environment working-directory: ${{ env.WORKING_DIRECTORY }} run: python -m venv openvino_lcm_cpp - + - name: Install python dependencies working-directory: ${{ env.WORKING_DIRECTORY }} run: | @@ -118,14 +125,7 @@ jobs: run: | . "./openvino_lcm_cpp/Scripts/Activate.ps1" optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 models/lcm_dreamshaper_v7/FP16 - - - name: Build app - working-directory: ${{ env.WORKING_DIRECTORY }} - run: | - . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --parallel - + - name: Run app working-directory: ${{ env.WORKING_DIRECTORY }} run: | diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 18194c301c..970fbb4159 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -16,8 +16,8 @@ permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b env: WORKING_DIRECTORY: "./image_generation/stable_diffusion_1_5/cpp/" PYTHON_VERSION: '3.8' - LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/l_openvino_toolkit_ubuntu20_2024.4.0.dev20240828_x86_64.tgz - WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16527-382ac845923/w_openvino_toolkit_windows_2024.4.0.dev20240828_x86_64.zip + LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu20_2024.5.0.dev20240830_x86_64.tgz + 
WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/w_openvino_toolkit_windows_2024.5.0.dev20240830_x86_64.zip OV_INSTALL_DIR: ${{ github.workspace }}/ov concurrency: @@ -41,12 +41,19 @@ jobs: mkdir ${{ env.OV_INSTALL_DIR }} tar -xzf openvino_package.tar.gz -C ${{ env.OV_INSTALL_DIR }} --strip-components=1 + - name: Build app + working-directory: ${{ env.WORKING_DIRECTORY }} + run: | + source ${{ env.OV_INSTALL_DIR }}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: python-version: ${{ env.PYTHON_VERSION }} cache: 'pip' - + - name: Create virtual environment working-directory: ${{ env.WORKING_DIRECTORY }} run: python3 -m venv openvino_sd_cpp @@ -62,14 +69,7 @@ jobs: working-directory: ${{ env.WORKING_DIRECTORY }} run: | source openvino_sd_cpp/bin/activate - optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 - - - name: Build app - working-directory: ${{ env.WORKING_DIRECTORY }} - run: | - source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --parallel + optimum-cli export openvino --model botp/stable-diffusion-v1-5 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 - name: Run app working-directory: ${{ env.WORKING_DIRECTORY }} @@ -95,37 +95,37 @@ jobs: Expand-Archive openvino_package.zip -DestinationPath ./tmp mv ./tmp/*/* . popd - + + - name: Build app + working-directory: ${{ env.WORKING_DIRECTORY }} + run: | + . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + - name: Setup Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: python-version: ${{ env.PYTHON_VERSION }} cache: 'pip' - + - name: Create virtual environment working-directory: ${{ env.WORKING_DIRECTORY }} run: python -m venv openvino_sd_cpp - + - name: Install python dependencies working-directory: ${{ env.WORKING_DIRECTORY }} run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - + - name: Download and convert model and tokenizer working-directory: ${{ env.WORKING_DIRECTORY }} run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" - optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 - - - name: Build app - working-directory: ${{ env.WORKING_DIRECTORY }} - run: | - . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --parallel - + optimum-cli export openvino --model botp/stable-diffusion-v1-5 --task stable-diffusion models/stable_diffusion_v1_5_ov/FP16 + - name: Run app working-directory: ${{ env.WORKING_DIRECTORY }} run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index a39ca7ed90..e6458d27b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,18 +26,18 @@ if(POLICY CMP0169) endif() project(OpenVINOGenAI - VERSION 2024.4.0.0 + VERSION 2024.5.0.0 DESCRIPTION "OpenVINO GenAI" HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai" LANGUAGES CXX) # Find OpenVINODeveloperPackage first to compile with SDL flags find_package(OpenVINODeveloperPackage ${OpenVINOGenAI_VERSION} QUIET - COMPONENTS Runtime Threading + COMPONENTS Runtime PATHS "${OpenVINO_DIR}") if(NOT OpenVINODeveloperPackage_FOUND) find_package(OpenVINO ${OpenVINOGenAI_VERSION} REQUIRED - COMPONENTS Runtime Threading) + COMPONENTS Runtime) endif() include(cmake/features.cmake) @@ -56,6 +56,10 @@ if(ENABLE_PYTHON) endif() endif() +if(WIN32 OR APPLE) + set(CMAKE_DEBUG_POSTFIX "d") +endif() + add_subdirectory(thirdparty) add_subdirectory(src) add_subdirectory(samples) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/CMakeLists.txt b/image_generation/lcm_dreamshaper_v7/cpp/CMakeLists.txt index e3ab524859..7e7680a393 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/CMakeLists.txt +++ b/image_generation/lcm_dreamshaper_v7/cpp/CMakeLists.txt @@ -15,7 +15,7 @@ set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake build type") # dependencies -find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) +find_package(OpenVINO REQUIRED COMPONENTS Runtime) include(FetchContent) diff --git a/image_generation/requirements.txt b/image_generation/requirements.txt index b53afc7b9a..bd5d3d677d 100644 --- a/image_generation/requirements.txt +++ b/image_generation/requirements.txt @@ 
-1,2 +1,2 @@ -r ../samples/requirements.txt -diffusers==0.30.1 +diffusers==0.30.2 diff --git a/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt b/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt index 0e3f140e14..77466668a4 100644 --- a/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt +++ b/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt @@ -15,7 +15,7 @@ set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake build type") # dependencies -find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) +find_package(OpenVINO REQUIRED COMPONENTS Runtime) include(FetchContent) diff --git a/image_generation/stable_diffusion_1_5/cpp/README.md b/image_generation/stable_diffusion_1_5/cpp/README.md index 57cf80d4fa..144f5a0552 100644 --- a/image_generation/stable_diffusion_1_5/cpp/README.md +++ b/image_generation/stable_diffusion_1_5/cpp/README.md @@ -57,7 +57,7 @@ The path to the OpenVINO install directory is referred as `` throug 2. Download the model from Huggingface and convert it to OpenVINO IR via [optimum-intel CLI](https://github.com/huggingface/optimum-intel). 
Example models to download: - - [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) + - [botp/stable-diffusion-v1-5](https://huggingface.co/botp/stable-diffusion-v1-5) - [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) Example command for downloading [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) model and exporting it with FP16 precision: diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index b11b66a214..321441364d 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -224,11 +224,6 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) pt_inputs = tokenizer(input_text_list, return_tensors="pt") input_token_size = pt_inputs.input_ids.shape[1] - pipe_tokenizer = model.get_tokenizer() - tok_encode_start = time.perf_counter() - input_data = pipe_tokenizer.encode(input_text_list) - tok_encode_end = time.perf_counter() - tok_encode_time = (tok_encode_end - tok_encode_start) * 1000 if args['batch_size'] > 1: out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) out_str += " Batch_size={}, ".format(args['batch_size']) @@ -243,21 +238,19 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.start_collect_memory_consumption() max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] - streamer.reset() start = time.perf_counter() - generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).tokens + generation_result = model.generate(input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"]) end = 
time.perf_counter() - log.info(type(generated_tokens[0])) + generated_text = generation_result.texts + perf_metrics = generation_result.perf_metrics + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() mem_consumption.clear_max_memory_consumption() generation_time = end - start - tok_decode_start = time.perf_counter() - generated_text = pipe_tokenizer.decode(generated_tokens) - tok_decode_end = time.perf_counter() - tok_decode_time = (tok_decode_end - tok_decode_start) * 1000 + generated_tokens = [tokenizer(text).input_ids for text in generated_text] # Only text_gen need to minus length of input_data, because generated_text may include input_text num_tokens = 0 result_md5_list = [] @@ -275,9 +268,13 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data else: md5_list[num][prompt_index] = result_md5_list per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) - tm_list = streamer.get_time_list() + tm_list = np.array(perf_metrics.raw_metrics.m_durations) / 1000 / 1000 log.debug('latency of all tokens:') [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + tokenization_time = ( + np.mean(perf_metrics.raw_metrics.tokenization_durations) / 1000, + np.mean(perf_metrics.raw_metrics.detokenization_durations) / 1000 + ) iter_data = gen_iterate_data( num, input_token_size * args['batch_size'], @@ -290,19 +287,19 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, prompt_idx=prompt_index, - tokenization_time=(tok_encode_time, tok_decode_time) + tokenization_time=tokenization_time ) iter_data_list.append(iter_data) llm_bench_utils.metrics_print.print_metrics( num, iter_data, - tm_list, + 
tm_list.tolist(), [], warm_up=(num == 0), max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, max_uss_mem=max_uss_mem_consumption, - tokenization_time=(tok_encode_time, tok_decode_time), + tokenization_time=tokenization_time, batch_size=args['batch_size'] ) if num > 0: @@ -320,7 +317,6 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data assert (result_md5_list == prev_md5) else: llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) - streamer.reset() def run_text_generation_benchmark(model_path, framework, device, args, num_iters): @@ -726,7 +722,8 @@ def get_argprser(): def main(): - log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout, encoding="utf-8") + logging_kwargs = {"encoding": "utf-8"} if sys.version_info[1] > 8 else {} + log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout, **logging_kwargs) args = get_argprser() model_path, framework, model_args, model_name = llm_bench_utils.model_utils.analyze_args(args) diff --git a/llm_bench/python/llm_bench_utils/ov_utils.py b/llm_bench/python/llm_bench_utils/ov_utils.py index cd91991a53..1f9c6b6a31 100644 --- a/llm_bench/python/llm_bench_utils/ov_utils.py +++ b/llm_bench/python/llm_bench_utils/ov_utils.py @@ -144,9 +144,7 @@ def create_text_gen_model(model_path, device, **kwargs): raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') else: if kwargs.get("genai", False) and is_genai_available(log_msg=True): - if kwargs["batch_size"] > 1 or kwargs["num_beams"] > 1: - log.warning("OpenVINO GenAI based benchmarking implmented only for batch_size == 1 and num_beams == 1") - elif model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"]]: + if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], 
OV_MODEL_CLASSES_MAPPING["mpt"]]: log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. Will be switched to default bencmarking") else: return create_genai_text_gen_model(model_path, device, ov_config, **kwargs) @@ -183,34 +181,6 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): import openvino_genai from transformers import AutoTokenizer - class TokenStreamer(openvino_genai.StreamerBase): - def __init__(self, tokenizer): - openvino_genai.StreamerBase.__init__(self) - self.tokenizer = tokenizer - self.token_generation_time = [] - self.generated_tokens = [] - self.start_time = time.perf_counter() - - def put(self, token_id): - self.token_generation_time.append(time.perf_counter() - self.start_time) - self.generated_tokens.append(token_id) - self.start_time = time.perf_counter() - return False - - def reset(self): - self.token_generation_time = [] - self.generated_tokens = [] - self.start_time = time.perf_counter() - - def end(self): - pass - - def get_tokens(self): - return self.generated_tokens - - def get_time_list(self): - return self.token_generation_time - if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists(): convert_ov_tokenizer(model_path) @@ -220,9 +190,8 @@ def get_time_list(self): llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper(), ov_config) end = time.perf_counter() log.info(f'Pipeline initialization time: {end - start:.2f}s') - streamer = TokenStreamer(llm_pipe.get_tokenizer()) - return llm_pipe, tokenizer, end - start, streamer, True + return llm_pipe, tokenizer, end - start, None, True def convert_ov_tokenizer(tokenizer_path): diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index ae31250edb..b944d0585a 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -10,7 +10,7 @@ torch transformers>=4.40.0 diffusers>=0.22.0 #optimum is in dependency list of 
optimum-intel -git+https://github.com/huggingface/optimum-intel.git@e9800ced0f6ceaa7aa0afe67327bfe348815620d#egg=optimum-intel +git+https://github.com/huggingface/optimum-intel.git@9a8782446e394ac07283b8bd8b44916c4f297826#egg=optimum-intel git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging psutil diff --git a/pyproject.toml b/pyproject.toml index 1ea9c9b85f..a1ac58a6a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openvino_genai" -version = "2024.4.0.0" +version = "2024.5.0.0" description = "Python bindings for https://github.com/openvinotoolkit/openvino.genai" requires-python = ">=3.8" readme = {file = "src/README.md", content-type="text/markdown"} @@ -16,7 +16,7 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] dependencies = [ - "openvino_tokenizers~=2024.4.0.0.dev" + "openvino_tokenizers~=2024.5.0.0.dev" ] [tool.py-build-cmake.module] diff --git a/samples/cpp/beam_search_causal_lm/CMakeLists.txt b/samples/cpp/beam_search_causal_lm/CMakeLists.txt index 9ea4730528..9bf1a8aac8 100644 --- a/samples/cpp/beam_search_causal_lm/CMakeLists.txt +++ b/samples/cpp/beam_search_causal_lm/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + HINTS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH ) add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) diff --git a/samples/cpp/benchmark_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt index 3a05c37d62..902a05eee6 100644 --- a/samples/cpp/benchmark_genai/CMakeLists.txt +++ b/samples/cpp/benchmark_genai/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH ) include(FetchContent) diff --git a/samples/cpp/chat_sample/CMakeLists.txt b/samples/cpp/chat_sample/CMakeLists.txt index 901f003d4c..69578dc86c 100644 --- a/samples/cpp/chat_sample/CMakeLists.txt +++ b/samples/cpp/chat_sample/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH ) add_executable(chat_sample chat_sample.cpp) diff --git a/samples/cpp/greedy_causal_lm/CMakeLists.txt b/samples/cpp/greedy_causal_lm/CMakeLists.txt index 409733bbc6..ff5151676f 100644 --- a/samples/cpp/greedy_causal_lm/CMakeLists.txt +++ b/samples/cpp/greedy_causal_lm/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. 
- ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH ) add_executable(greedy_causal_lm greedy_causal_lm.cpp) diff --git a/samples/cpp/multinomial_causal_lm/CMakeLists.txt b/samples/cpp/multinomial_causal_lm/CMakeLists.txt index 01b3bb3bb4..83b2335431 100644 --- a/samples/cpp/multinomial_causal_lm/CMakeLists.txt +++ b/samples/cpp/multinomial_causal_lm/CMakeLists.txt @@ -1,9 +1,11 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH ) add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index 9b7a15131d..c899c6e47b 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -3,9 +3,11 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH ) add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 3419f3221a..5e372a3f09 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -238,7 +238,7 @@ int main(int argc, char* argv[]) try { ov::Tensor position_ids = model.get_tensor("position_ids"); position_ids.set_shape(input_ids.get_shape()); std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - uint64_t seq_len = input_ids.get_shape()[1]; + size_t seq_len = input_ids.get_shape()[1]; // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index 1a9b02f1b2..078ac8bb52 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -3,9 +3,11 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH ) add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 83eb74fe6d..66c7c231fa 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -319,7 +319,7 @@ int main(int argc, char* argv[]) try { ov::InferRequest draft_model = core.compile_model(ov_draft_model, "CPU").create_infer_request(); - uint64_t seq_len = input_ids.get_shape()[1]; + size_t seq_len = input_ids.get_shape()[1]; // main model (which is bigger, more accurate but slower) std::shared_ptr ov_main_model = core.read_model(std::string{argv[2]} + "/openvino_model.xml"); diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 56e19fbd9f..7bdc7cd863 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -15,8 +15,8 @@ endif() function(ov_genai_build_jinja2cpp) FetchContent_Declare(jinja2cpp - URL https://github.com/jinja2cpp/Jinja2Cpp/archive/bcc0e30f17f17f738ec2a7a31316d6efbe78a0e0.tar.gz - URL_HASH SHA256=f76547deb323240e7d181ecda6f658757ea5eb07ce772cf39e8bd20467412164) + URL https://github.com/jinja2cpp/Jinja2Cpp/archive/b32fbde7d98d13c34784c332c4a24a6f92c76e38.tar.gz + URL_HASH SHA256=7cc25ddbc438a5c874d404e100b4eccd8a331c195417f5487c48aebcf4b9e7fb) FetchContent_GetProperties(jinja2cpp) if(NOT jinja2cpp_POPULATED) diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index ad53d8d941..f256a123de 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -37,9 +37,6 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { std::vector m_new_token_times; std::vector m_batch_sizes; std::vector m_durations; - - size_t num_generated_tokens; - size_t num_input_tokens; }; /** @@ -111,15 +108,15 @@ struct 
OPENVINO_GENAI_EXPORTS PerfMetrics { size_t num_input_tokens; float get_load_time(); // Load time in ms. - float get_num_generated_tokens(); - float get_num_input_tokens(); - MeanStdPair get_ttft(); // Time to the first token (in ms) (TTTFT). - MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT). - MeanStdPair get_throughput(); // Tokens per second. + size_t get_num_generated_tokens(); + size_t get_num_input_tokens(); + MeanStdPair get_ttft(); // Time to the first token (in ms) (TTFT). + MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT). + MeanStdPair get_throughput(); // Tokens per second. - MeanStdPair get_generate_duration(); - MeanStdPair get_tokenization_duration(); - MeanStdPair get_detokenization_duration(); + MeanStdPair get_generate_duration(); // in ms + MeanStdPair get_tokenization_duration(); // in ms + MeanStdPair get_detokenization_duration(); // in ms // Flag indicating if raw metrics were evaluated. // If false means current mean/std ttft, tpot, etc. are not actual diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index d89d63faa9..66e2890671 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -171,7 +171,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time)); raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time)); - + + // Added tokenization/detokenization times, and updated generate duration, need to reevaluate statistics. 
+ decoded_results.perf_metrics.m_evaluated = false; decoded_results.perf_metrics.evaluate_statistics(start_time); return decoded_results; } diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 71a76bc922..002323dd46 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -9,6 +9,8 @@ #include "utils.hpp" #include +#include +#include namespace { @@ -83,9 +85,39 @@ std::shared_ptr add_slices_to_kvcache_inputs(const std::shared_ptr(model->get_results(), ov::SinkVector{}, new_params); } +struct KVAxesPosition { + uint32_t batch; + uint32_t seq_len; +}; + +KVAxesPosition get_kv_axes(const std::string& model_type) { + KVAxesPosition axes; + if (model_type == "chatglm") { + axes.batch = 1u; + axes.seq_len = 0u; + } else if (model_type == "qwen") { + // Note, qwen2 does not fall into this category and conforms to default layout + axes.batch = 0u; + axes.seq_len = 1u; + } else { + axes.batch = 0u; + axes.seq_len = 2u; + } + return axes; +} + +std::string get_model_type_from_json(const std::filesystem::path& filepath) { + std::ifstream file(filepath); + OPENVINO_ASSERT(file.is_open(), "Could not open file: " + filepath.string()); + nlohmann::json config_data = nlohmann::json::parse(file); + std::string model_type = config_data["model_type"].get(); + return model_type; +} + void reshape_to_static(std::shared_ptr model, const uint32_t input_size, - const uint32_t kvcache_size) { + const uint32_t kvcache_size, + const KVAxesPosition& kv_axes_position) { std::map new_shapes; for (auto input : model->inputs()) { const auto& input_name = input.get_any_name(); @@ -98,10 +130,9 @@ void reshape_to_static(std::shared_ptr model, new_shape = ov::PartialShape({1, input_size}); } else { const auto& partial_shape = input.get_partial_shape(); - new_shape = ov::PartialShape({1, - partial_shape[1].get_length(), - kvcache_size-input_size, - partial_shape[3].get_length()}); + new_shape = partial_shape; + 
new_shape[kv_axes_position.batch] = 1; + new_shape[kv_axes_position.seq_len] = kvcache_size - input_size; } new_shapes.emplace(input_name, new_shape); } @@ -222,10 +253,10 @@ StaticLLMPipeline::StaticLLMPipeline( // (6) Reshape both models to static shape const auto kMaxPromptLen = pop_or_default(pipeline_config, "MAX_PROMPT_LEN", 1024u); const auto kMinResponseLen = pop_or_default(pipeline_config, "MIN_RESPONSE_LEN", 150u); - // FIXME For some models KV-cache dim != 2u - m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, 2u }; - reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size); - reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size); + KVAxesPosition axes = get_kv_axes(get_model_type_from_json(path / "config.json")); + m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len }; + reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); + reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); // (7) Compile both model auto prefill_config = pop_or_default(pipeline_config, "PREFILL_CONFIG", get_default_prefill_config()); auto generate_config = pop_or_default(pipeline_config, "GENERATE_CONFIG", get_default_generate_config()); @@ -276,28 +307,46 @@ DecodedResults StaticLLMPipeline::generate( OptionalGenerationConfig generation_config, StreamerVariant streamer ) { + auto start_time = std::chrono::steady_clock::now(); GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; - if (std::holds_alternative>(inputs)) { - OPENVINO_THROW("Currently only batch size=1 is supported"); - } - - OPENVINO_ASSERT(std::holds_alternative(inputs)); - auto& prompt = std::get(inputs); - - if (m_is_chat_conversation) { - m_history.push_back({{"role", "user"}, {"content", prompt}}); - constexpr bool add_generation_prompt = true; - prompt = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + TokenizedInputs tokenized_input; + if (auto input_vector = std::get_if>(&inputs)) { + // OPENVINO_ASSERT(!m_is_chat_conversation, "Can't chat with multiple prompts"); + auto& strings = std::get>(inputs); + if (strings.size() != 1) { + OPENVINO_THROW("Currently only batch size=1 is supported"); + } else { + tokenized_input = m_tokenizer.encode(*input_vector); + } + } else if (auto input_prompt = std::get_if(&inputs)) { + std::string& prompt = *input_prompt; + if (m_is_chat_conversation) { + m_history.push_back({{"role", "user"}, {"content", prompt}}); + constexpr bool add_generation_prompt = true; + prompt = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } + tokenized_input = m_tokenizer.encode(prompt); } - auto tokenized_input = m_tokenizer.encode(prompt); + auto encode_stop_time = std::chrono::steady_clock::now(); auto encoded_results = generate(tokenized_input, config, streamer); + auto decode_start_time = std::chrono::steady_clock::now(); DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + auto decode_stop_time = std::chrono::steady_clock::now(); if (m_is_chat_conversation) { auto answer = decoded_results.texts[0]; m_history.push_back({{"role", "assistant"}, {"content", answer}}); } + // generate_durations + decoded_results.perf_metrics = encoded_results.perf_metrics; + auto& raw_counters = decoded_results.perf_metrics.raw_metrics; + auto stop_time = std::chrono::steady_clock::now(); + raw_counters.generate_durations = std::vector(); + 
raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time)); + raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time)); + decoded_results.perf_metrics.evaluate_statistics(start_time); return decoded_results; } @@ -306,6 +355,7 @@ EncodedResults StaticLLMPipeline::generate( OptionalGenerationConfig generation_config, StreamerVariant streamer ) { + auto start_time = std::chrono::steady_clock::now(); ov::Tensor input_ids; ov::Tensor attention_mask; @@ -340,7 +390,10 @@ EncodedResults StaticLLMPipeline::generate( OPENVINO_THROW("Currently only greedy decoding is supported"); } + ov::Shape prompts_shape = input_ids.get_shape(); + const size_t batch_size = prompts_shape[0]; ov::genai::EncodedResults results; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; // NB: Only batch=1 is supported now results.scores.resize(1u); results.scores[0] = 0u; @@ -370,6 +423,8 @@ EncodedResults StaticLLMPipeline::generate( std::iota(padded_pos_data + offset, padded_pos_data + padded_position_ids.get_size(), 0u); m_prefill_request.infer(); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); // NB: Now there are prompt_len tokens in KV-cache m_kvcache_desc.num_stored_tokens += prompt_len; @@ -423,6 +478,8 @@ EncodedResults StaticLLMPipeline::generate( last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0); results.tokens[0].push_back(last_token); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); if (streamer_ptr && streamer_ptr->put(last_token)) { break; } @@ -447,6 +504,13 @@ EncodedResults StaticLLMPipeline::generate( 
m_kvcache_request.get_tensor(output_name).copy_to(kvcache_in_slice); } } + auto stop_time = std::chrono::steady_clock::now(); + // If is called without tokenization then that stat will not be reported. + auto& metrics = results.perf_metrics; + metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); + metrics.load_time = this->m_load_time_ms; + metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + metrics.evaluate_statistics(start_time); return results; } diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index 2f378ab302..9ed0233232 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -10,6 +10,9 @@ namespace { ov::genai::MeanStdPair calc_mean_and_std(const std::vector& durations) { + if (durations.size() == 0) { + return {-1, -1}; + } // Accepts time durations in microseconds and returns standard deviation and mean in milliseconds. float mean = std::accumulate(durations.begin(), durations.end(), 0.0f, [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { @@ -36,14 +39,14 @@ float PerfMetrics::get_load_time() { return load_time; } -float PerfMetrics::get_num_generated_tokens() { +size_t PerfMetrics::get_num_generated_tokens() { evaluate_statistics(); return num_generated_tokens; } -float PerfMetrics::get_num_input_tokens() { +size_t PerfMetrics::get_num_input_tokens() { evaluate_statistics(); - return num_generated_tokens; + return num_input_tokens; } MeanStdPair PerfMetrics::get_ttft() { diff --git a/src/cpp/src/tokenizers_path.cpp b/src/cpp/src/tokenizers_path.cpp index e247a65a4c..d0cd82c265 100644 --- a/src/cpp/src/tokenizers_path.cpp +++ b/src/cpp/src/tokenizers_path.cpp @@ -57,12 +57,17 @@ std::string get_ov_genai_library_path() { } std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { +#if !defined(NDEBUG) && (defined(__APPLE__) || defined(_WIN32)) +# define LIB_POSTFIX "d" +#else +# define 
LIB_POSTFIX "" +#endif #ifdef _WIN32 - constexpr char tokenizers[] = "openvino_tokenizers.dll"; + constexpr char tokenizers[] = "openvino_tokenizers" LIB_POSTFIX ".dll"; #elif defined(__linux__) - constexpr char tokenizers[] = "libopenvino_tokenizers.so"; + constexpr char tokenizers[] = "libopenvino_tokenizers" LIB_POSTFIX ".so"; #elif defined(__APPLE__) - constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; + constexpr char tokenizers[] = "libopenvino_tokenizers" LIB_POSTFIX ".dylib"; #else # error "Unsupported OS" #endif diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index 9e01068972..6c7bbf39ef 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -12,15 +12,17 @@ os.add_dll_directory(os.path.dirname(__file__)) from .py_generate_pipeline import ( - LLMPipeline, - Tokenizer, - GenerationConfig, - TokenizedInputs, + ContinuousBatchingPipeline, DecodedResults, EncodedResults, - StreamerBase, - StopCriteria, - ContinuousBatchingPipeline, + GenerationConfig, GenerationResult, + LLMPipeline, + PerfMetrics, + RawPerfMetrics, SchedulerConfig, + StopCriteria, + StreamerBase, + TokenizedInputs, + Tokenizer ) diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index aada9b0939..225257f0d9 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -247,22 +247,22 @@ auto perf_metrics_docstring = R"( :param get_num_input_tokens: Returns the number of tokens in the input prompt. :type get_num_input_tokens: int - :param get_ttft: Returns the mean and standard deviation of TTFT. + :param get_ttft: Returns the mean and standard deviation of TTFT in milliseconds. :type get_ttft: MeanStdPair - :param get_tpot: Returns the mean and standard deviation of TPOT. + :param get_tpot: Returns the mean and standard deviation of TPOT in milliseconds. 
:type get_tpot: MeanStdPair - :param get_throughput: Returns the mean and standard deviation of throughput. + :param get_throughput: Returns the mean and standard deviation of throughput in tokens per second. :type get_throughput: MeanStdPair - :param get_generate_duration: Returns the mean and standard deviation of generate duration. + :param get_generate_duration: Returns the mean and standard deviation of generate durations in milliseconds. :type get_generate_duration: MeanStdPair - :param get_tokenization_duration: Returns the mean and standard deviation of tokenization duration. + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization durations in milliseconds. :type get_tokenization_duration: MeanStdPair - :param get_detokenization_duration: Returns the mean and standard deviation of detokenization duration. + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds. :type get_detokenization_duration: MeanStdPair :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. 
@@ -763,10 +763,12 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_<RawPerfMetrics>(m, "RawPerfMetrics", raw_perf_metrics_docstring) .def(py::init<>()) - .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) + .def_property_readonly("generate_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::generate_durations); + }) .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { return get_ms(rw, &RawPerfMetrics::tokenization_durations); - }) + }) .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { return get_ms(rw, &RawPerfMetrics::detokenization_durations); }) @@ -776,24 +778,27 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_property_readonly("m_durations", [](const RawPerfMetrics &rw) { return get_ms(rw, &RawPerfMetrics::m_durations); }) - .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes) - .def_readonly("num_generated_tokens", &RawPerfMetrics::num_generated_tokens) - .def_readonly("num_input_tokens", &RawPerfMetrics::num_input_tokens); + .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes); py::class_<MeanStdPair>(m, "MeanStdPair") .def(py::init<>()) .def_readonly("mean", &MeanStdPair::mean) - .def_readonly("std", &MeanStdPair::std); + .def_readonly("std", &MeanStdPair::std) + .def("__iter__", [](const MeanStdPair &self) { + return py::make_iterator(&self.mean, &self.std + 1); + }, py::keep_alive<0, 1>()); // Keep object alive while the iterator is used; py::class_<PerfMetrics>(m, "PerfMetrics", perf_metrics_docstring) .def(py::init<>()) + .def("get_load_time", &PerfMetrics::get_load_time) + .def("get_num_generated_tokens", &PerfMetrics::get_num_generated_tokens) + .def("get_num_input_tokens", &PerfMetrics::get_num_input_tokens) + .def("get_ttft", &PerfMetrics::get_ttft) + .def("get_tpot", &PerfMetrics::get_tpot) + .def("get_throughput", &PerfMetrics::get_throughput) .def("get_generate_duration", &PerfMetrics::get_generate_duration) .def("get_tokenization_duration", 
&PerfMetrics::get_tokenization_duration) .def("get_detokenization_duration", &PerfMetrics::get_detokenization_duration) - .def("get_throughput", &PerfMetrics::get_throughput) - .def("get_tpot", &PerfMetrics::get_tpot) - .def("get_ttft", &PerfMetrics::get_ttft) - .def("get_load_time", &PerfMetrics::get_load_time) .def("__add__", &PerfMetrics::operator+) .def("__iadd__", &PerfMetrics::operator+=) .def_readonly("raw_metrics", &PerfMetrics::raw_metrics); diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 9a02f506bb..f80729d425 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -723,3 +723,87 @@ def test_cb_streamer_vs_return_vs_stateful(prompt): reference = stateful.generate(prompt, max_new_tokens=20) assert generated == "".join(streamed) assert "".join(streamed) == reference + +def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics: + model_id, path, tokenizer, model, pipe = model_descr + + config = generation_config.copy() # to avoid side effects + + if 'do_sample' not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set explicitly to False, but only if test arguments omitted this arg. + # Do not apply 'repetition_penalty' if sampling is not used. 
+ config['do_sample'] = False + config['repetition_penalty'] = None + return pipe.generate([prompt], **config).perf_metrics + + +test_cases = [ + (dict(max_new_tokens=20), 'table is made of'), +] +@pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_perf_metrics(model_descr, generation_config, prompt): + import time + start_time = time.perf_counter() + perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt) + total_time = (time.perf_counter() - start_time) * 1000 + + # Check that load time is adequate. + load_time = perf_metrics.get_load_time() + assert load_time > 0 and load_time < 1000.0 + + # Check that num input and generated tokens are adequate. + num_generated_tokens = perf_metrics.get_num_generated_tokens() + assert num_generated_tokens > 0 and num_generated_tokens <= generation_config['max_new_tokens'] + + num_input_tokens = perf_metrics.get_num_input_tokens() + assert num_input_tokens > 0 and num_input_tokens <= len(prompt) + + mean_ttft, std_ttft = perf_metrics.get_ttft() + assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std) + assert mean_ttft > 0 and mean_ttft < 1000.0 + + mean_tpot, std_tpot = perf_metrics.get_tpot() + assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std) + assert mean_tpot > 0 and mean_tpot < 1000.0 + + mean_throughput, std_throughput = perf_metrics.get_throughput() + assert (mean_throughput, std_throughput) == (perf_metrics.get_throughput().mean, perf_metrics.get_throughput().std) + assert mean_throughput > 0 and mean_throughput < 20000.0 + + mean_gen_duration, std_gen_duration = perf_metrics.get_generate_duration() + assert (mean_gen_duration, std_gen_duration) == (perf_metrics.get_generate_duration().mean, perf_metrics.get_generate_duration().std) + assert mean_gen_duration > 0 and load_time 
+ mean_gen_duration < total_time + assert std_gen_duration == 0 + + mean_tok_duration, std_tok_duration = perf_metrics.get_tokenization_duration() + assert (mean_tok_duration, std_tok_duration) == (perf_metrics.get_tokenization_duration().mean, perf_metrics.get_tokenization_duration().std) + assert mean_tok_duration > 0 and mean_tok_duration < mean_gen_duration + assert std_tok_duration == 0 + + mean_detok_duration, std_detok_duration = perf_metrics.get_detokenization_duration() + assert (mean_detok_duration, std_detok_duration) == (perf_metrics.get_detokenization_duration().mean, perf_metrics.get_detokenization_duration().std) + assert mean_detok_duration > 0 and mean_detok_duration < mean_gen_duration + assert std_detok_duration == 0 + + # assert that calculating statistics manually from the raw counters we get the same results as from PerfMetrics + raw_metrics = perf_metrics.raw_metrics + raw_dur = np.array(raw_metrics.generate_durations) / 1000 + assert np.allclose(mean_gen_duration, np.mean(raw_dur)) + assert np.allclose(std_gen_duration, np.std(raw_dur)) + + raw_dur = np.array(raw_metrics.tokenization_durations) / 1000 + assert np.allclose(mean_tok_duration, np.mean(raw_dur)) + assert np.allclose(std_tok_duration, np.std(raw_dur)) + + raw_dur = np.array(raw_metrics.detokenization_durations) / 1000 + assert np.allclose(mean_detok_duration, np.mean(raw_dur)) + assert np.allclose(std_detok_duration, np.std(raw_dur)) + + assert len(raw_metrics.m_times_to_first_token) > 0 + assert len(raw_metrics.m_batch_sizes) > 0 + assert len(raw_metrics.m_durations) > 0 diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 018a7b2013..b6c36a3026 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 018a7b2013bb61ad5c0f62e80209b78734bbba60 +Subproject commit b6c36a302696329f008e4425c9d98c4e00194a24