From 47fbb5e7b78f8b741e687fee029b4079723f3214 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Thu, 1 Aug 2024 16:22:07 +0400 Subject: [PATCH] Merge releases/2024/3 into master (#720) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Alina Kladieva Co-authored-by: Anastasiia Pnevskaia Co-authored-by: Nikita Malinin Co-authored-by: Yaroslav Tarkan Co-authored-by: Anatoliy Talamanov Co-authored-by: Pavel Esir Co-authored-by: Miłosz Żeglarski Co-authored-by: Pavel Esir Co-authored-by: Alexander Suvorov Co-authored-by: Xiake Sun Co-authored-by: Damian Kalinowski Co-authored-by: Andrei Kochin Co-authored-by: Ekaterina Aidova --- Dockerfile | 38 ---- samples/CMakeLists.txt | 1 + samples/cpp/benchmark_genai/CMakeLists.txt | 24 ++ samples/cpp/benchmark_genai/README.md | 47 ++++ .../cpp/benchmark_genai/benchmark_genai.cpp | 70 ++++++ .../python/beam_search_causal_lm/README.md | 14 +- samples/python/benchmark_genai/README.md | 47 ++++ .../python/benchmark_genai/benchmark_genai.py | 49 ++++ samples/python/chat_sample/README.md | 16 +- samples/python/greedy_causal_lm/README.md | 14 +- .../python/multinomial_causal_lm/README.md | 14 +- src/README.md | 143 ++++++++++-- .../genai/continuous_batching_pipeline.hpp | 20 +- .../openvino/genai/generation_handle.hpp | 15 ++ .../include/openvino/genai/llm_pipeline.hpp | 6 + .../include/openvino/genai/perf_metrics.hpp | 149 +++++++++++++ .../openvino/genai/scheduler_config.hpp | 2 +- src/cpp/src/block_manager.hpp | 2 +- src/cpp/src/continuous_batching_pipeline.cpp | 146 +++++++++--- src/cpp/src/generation_handle.cpp | 4 + src/cpp/src/generation_stream.hpp | 3 + src/cpp/src/greedy_decoding.cpp | 20 +- src/cpp/src/group_beam_searcher.cpp | 25 ++- src/cpp/src/llm_pipeline.cpp | 210 ++++++++++++++++-- src/cpp/src/llm_pipeline_base.hpp | 2 + src/cpp/src/llm_pipeline_static.cpp | 32 ++- src/cpp/src/llm_pipeline_static.hpp | 4 + src/cpp/src/multinomial_decoding.cpp | 8 +- src/cpp/src/perf_metrics.cpp | 164 ++++++++++++++ src/cpp/src/scheduler.hpp | 2 +- src/cpp/src/synchronized_queue.hpp | 6 + src/cpp/src/tokenizer.cpp | 62 +++--- src/docs/BUILD.md | 196 ++++++++++++---- src/python/py_generate_pipeline.cpp | 185 ++++++++++++++- tests/cpp/scheduler.cpp | 2 - tests/python_tests/ov_genai_test_utils.py | 29 +-- tests/python_tests/test_chat_generate_api.py | 21 +- tests/python_tests/test_generate_api.py | 44 +++- tests/python_tests/tokenizer_configs.py | 8 +- 39 files changed, 1619 insertions(+), 225 deletions(-) delete mode 100644 Dockerfile create mode 100644 samples/cpp/benchmark_genai/CMakeLists.txt create mode 100644 samples/cpp/benchmark_genai/README.md create mode 100644 samples/cpp/benchmark_genai/benchmark_genai.cpp create mode 100644 samples/python/benchmark_genai/README.md create mode 100755 samples/python/benchmark_genai/benchmark_genai.py create mode 100644 src/cpp/include/openvino/genai/perf_metrics.hpp create mode 100644 src/cpp/src/perf_metrics.cpp diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index b73d907b87..0000000000 --- a/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM ubuntu:22.04 - -ARG JOBS -WORKDIR /workspace -RUN apt-get update -y && apt-get install -y python3-pip python3-venv git - -# Install OpenVINO -RUN git clone --branch master https://github.com/openvinotoolkit/openvino.git && \ - cd /workspace/openvino && \ - git submodule update --init -- /workspace/openvino/thirdparty/xbyak /workspace/openvino/thirdparty/pugixml /workspace/openvino/thirdparty/open_model_zoo \ - 
/workspace/openvino/thirdparty/protobuf /workspace/openvino/thirdparty/snappy /workspace/openvino/thirdparty/telemetry /workspace/openvino/src/plugins/intel_cpu/thirdparty/mlas \ - /workspace/openvino/src/plugins/intel_cpu/thirdparty/onednn /workspace/openvino/src/bindings/python/thirdparty/pybind11 && cd - - -RUN /workspace/openvino/install_build_dependencies.sh -RUN python3 -m pip install -r /workspace/openvino/src/bindings/python/wheel/requirements-dev.txt -RUN cmake -DENABLE_PYTHON=ON -DENABLE_PYTHON_PACKAGING=ON -DENABLE_WHEEL=ON -DENABLE_CPPLINT=OFF -DENABLE_SAMPLES=OFF -DENABLE_INTEL_GPU=OFF \ - -DENABLE_INTEL_NPU=OFF -DENABLE_TEMPLATE=OFF -DENABLE_AUTO=OFF -DENABLE_HETERO=OFF -DENABLE_AUTO_BATCH=OFF -DENABLE_OV_TF_FRONTEND=ON -DENABLE_OV_ONNX_FRONTEND=OFF \ - -DENABLE_OV_TF_LITE_FRONTEND=OFF -DENABLE_OV_PADDLE_FRONTEND=OFF -S /workspace/openvino -B /workspace/openvino_build -RUN cmake --build /workspace/openvino_build --parallel $JOBS -RUN cmake -P /workspace/openvino_build/cmake_install.cmake -RUN python3 -m pip install /workspace/openvino_build/wheels/openvino-2024* -ENV OpenVINO_DIR=/workspace/openvino_build - -# Download dataset -RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -# Build GenAI library with dependencies -RUN git clone https://github.com/Wovchena/openvino.genai-public.git -b reuse-Tokenizer openvino.genai && \ - cd /workspace/openvino.genai/thirdparty && git submodule update --remote --init && \ - mkdir /workspace/openvino.genai/build && cd /workspace/openvino.genai/build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. && \ - make -j${JOBS} - -# Install test dependencies -RUN python3 -m pip install --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly/ /workspace/openvino.genai/thirdparty/openvino_tokenizers -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/openvino.genai/tests/python_tests/continuous_batching/requirements.txt -ENV PYTHONPATH=/workspace/openvino.genai/build/ -ENV LD_LIBRARY_PATH=/workspace/openvino.genai/build/ diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 0839d58428..5339817c1f 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -10,6 +10,7 @@ add_subdirectory(cpp/greedy_causal_lm) add_subdirectory(cpp/multinomial_causal_lm) add_subdirectory(cpp/prompt_lookup_decoding_lm) add_subdirectory(cpp/speculative_decoding_lm) +add_subdirectory(cpp/benchmark_genai) install(FILES requirements.txt DESTINATION samples COMPONENT cpp_samples_genai) diff --git a/samples/cpp/benchmark_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt new file mode 100644 index 0000000000..5443439de5 --- /dev/null +++ b/samples/cpp/benchmark_genai/CMakeLists.txt @@ -0,0 +1,24 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+) + +FetchContent_Declare(cxxopts + URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz + URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) +FetchContent_MakeAvailable(cxxopts) + +add_executable(benchmark_genai benchmark_genai.cpp) +target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts) +set_target_properties(benchmark_genai PROPERTIES + COMPILE_PDB_NAME benchmark_genai + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +install(TARGETS benchmark_genai + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md new file mode 100644 index 0000000000..616bb6a36d --- /dev/null +++ b/samples/cpp/benchmark_genai/README.md @@ -0,0 +1,47 @@ +# LLMs benchmarking sample + +This sample script demonstrates how to benchmark an LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Usage + +```sh +benchmark_vanilla_genai [OPTIONS] +``` + +### Options + +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + +### Output: + +``` +benchmark_vanilla_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10 +``` + +``` +Load time: 3405.69 ms +Generate time: 1430.77 ± 3.04 ms +Tokenization time: 0.51 ± 0.02 ms +Detokenization time: 0.37 ± 0.01 ms +TTFT: 81.60 ± 0.54 ms +TPOT: 71.52 ± 2.72 ms +Throughput tokens/s: 13.98 ± 0.53 +``` + +For more information how performance metrics are calculated please follow [performance-metrics tutorial](../../../src/README.md#performance-metrics). 
diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp new file mode 100644 index 0000000000..287d6b379a --- /dev/null +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" +#include + +int main(int argc, char* argv[]) try { + cxxopts::Options options("benchmark_vanilla_genai", "Help command"); + + options.add_options() + ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) + ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) + ("nw,num_warmup", "Number of warmup iterations", cxxopts::value()->default_value(std::to_string(1))) + ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(3))) + ("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value()->default_value(std::to_string(20))) + ("d,device", "device", cxxopts::value()->default_value("CPU")) + ("h,help", "Print usage"); + + cxxopts::ParseResult result; + try { + result = options.parse(argc, argv); + } catch (const cxxopts::exceptions::exception& e) { + std::cout << e.what() << "\n\n"; + std::cout << options.help() << std::endl; + return EXIT_FAILURE; + } + + if (result.count("help")) { + std::cout << options.help() << std::endl; + return EXIT_SUCCESS; + } + + std::string prompt = result["prompt"].as(); + const std::string model_path = result["model"].as(); + std::string device = result["device"].as(); + size_t num_warmup = result["num_warmup"].as(); + size_t num_iter = result["num_iter"].as(); + + ov::genai::GenerationConfig config; + config.max_new_tokens = result["max_new_tokens"].as(); + + ov::genai::LLMPipeline pipe(model_path, device); + + for (size_t i = 0; i < num_warmup; i++) + pipe.generate(prompt, config); + + ov::genai::DecodedResults res = pipe.generate(prompt, config); + ov::genai::PerfMetrics metrics = res.perf_metrics; + for (size_t i = 0; i < num_iter - 1; i++) { + res = pipe.generate(prompt, config); + metrics = metrics + res.perf_metrics; + } + + std::cout << std::fixed << std::setprecision(2); + std::cout << "Load time: " << metrics.get_load_time() << " ms" << std::endl; + std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ± " << metrics.get_generate_duration().std << " ms" << std::endl; + std::cout << "Tokenization time: " << metrics.get_tokenization_duration().mean << " ± " << metrics.get_tokenization_duration().std << " ms" << std::endl; + std::cout << "Detokenization time: " << metrics.get_detokenization_duration().mean << " ± " << metrics.get_detokenization_duration().std << " ms" << std::endl; + std::cout << "TTFT: " << metrics.get_ttft().mean << " ± " << metrics.get_ttft().std << " ms" << std::endl; + std::cout << "TPOT: " << metrics.get_tpot().mean << " ± " << metrics.get_tpot().std << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.get_throughput().mean << " ± " << metrics.get_throughput().std << " tokens/s" << std::endl; + + return 0; +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) 
{ + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md index 5e80aa69da..7e412db379 100644 --- a/samples/python/beam_search_causal_lm/README.md +++ b/samples/python/beam_search_causal_lm/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `beam_search_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/benchmark_genai/README.md b/samples/python/benchmark_genai/README.md new file mode 100644 index 0000000000..9baf17c4d7 --- /dev/null +++ b/samples/python/benchmark_genai/README.md @@ -0,0 +1,47 @@ +# LLMs benchmarking sample + +This sample script demonstrates how to benchmark an LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Usage + +```sh +python benchmark_genai.py [OPTIONS] +``` + +### Options + +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on. 
+ +### Output: + +``` +python benchmark_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 +``` + +``` +Load time: 3405.69 ms +Generate time: 1430.77 ± 3.04 ms +Tokenization time: 0.51 ± 0.02 ms +Detokenization time: 0.37 ± 0.01 ms +TTFT: 81.60 ± 0.54 ms +TPOT: 71.52 ± 2.72 ms +Throughput tokens/s: 13.98 ± 0.53 +``` + +For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics). diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py new file mode 100755 index 0000000000..9851483880 --- /dev/null +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -0,0 +1,49 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai as ov_genai + +def main(): + parser = argparse.ArgumentParser(description="Help command") + parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") + parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") + parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") + parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations") + parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") + parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") + + args = parser.parse_args() + + # Perf metrics is stored in DecodedResults. + # In order to get DecodedResults instead of a string input should be a list. + prompt = [args.prompt] + model_path = args.model + device = args.device + num_warmup = args.num_warmup + num_iter = args.num_iter + + config = ov_genai.GenerationConfig() + config.max_new_tokens = args.max_new_tokens + + pipe = ov_genai.LLMPipeline(model_path, device) + + for _ in range(num_warmup): + pipe.generate(prompt, config) + + res = pipe.generate(prompt, config) + perf_metrics = res.perf_metrics + for _ in range(num_iter - 1): + res = pipe.generate(prompt, config) + perf_metrics += res.perf_metrics + + print(f"Load time: {perf_metrics.get_load_time():.2f} ms") + print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms") + print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms") + print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms") + print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms") + print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms") + print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s") + +if __name__ == "__main__": + main() diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md index c07023391f..66fe4b0d93 100644 --- a/samples/python/chat_sample/README.md +++ b/samples/python/chat_sample/README.md @@ -17,15 +17,25 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `chat_sample.py TinyLlama-1.1B-Chat-v1.0` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. 
Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. +### Troubleshooting -## Troubleshooting -### Missing chat template +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. + +#### Missing chat template If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. The following template can be used as a default, but it may not work properly with every model: diff --git a/samples/python/greedy_causal_lm/README.md b/samples/python/greedy_causal_lm/README.md index 97b044eb51..1f0eb333ea 100644 --- a/samples/python/greedy_causal_lm/README.md +++ b/samples/python/greedy_causal_lm/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `greedy_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. 
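+
+For example, in Windows cmd the variable can be set before running the sample (using the same invocation as in the Usage section above):
+
+```sh
+set PYTHONIOENCODING=utf8
+greedy_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"
+```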
diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md index d39142f3de..0778868e6a 100644 --- a/samples/python/multinomial_causal_lm/README.md +++ b/samples/python/multinomial_causal_lm/README.md @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/src/README.md b/src/README.md index 445b88aa58..893ffb5ea9 100644 --- a/src/README.md +++ b/src/README.md @@ -5,10 +5,24 @@ It hides the complexity of the generation process and minimizes the amount of co ## Install OpenVINO™ GenAI +> **NOTE**: Please make sure that you are following the versions compatibility rules, refer to the [OpenVINO™ GenAI Dependencies](#openvino-genai-dependencies) for more information. + The OpenVINO™ GenAI flavor is available for installation via Archive and PyPI distributions. To install OpenVINO™ GenAI, refer to the [Install Guide](https://docs.openvino.ai/2024/get-started/install-openvino.html). -To build OpenVINO™ GenAI library from source, refer to the [Build Instructions](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/BUILD.md). +To build OpenVINO™ GenAI library from source, refer to the [Build Instructions](./docs/BUILD.md). + +### OpenVINO™ GenAI Dependencies + +OpenVINO™ GenAI depends on [OpenVINO](https://github.com/openvinotoolkit/openvino) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). + +When installing OpenVINO™ GenAI from PyPi, the same versions of OpenVINO and OpenVINO Tokenizers are used (e.g. `openvino==2024.3.0` and `openvino-tokenizers==2024.3.0.0` are installed for `openvino-genai==2024.3.0`). +If you update one of the dependency packages (e.g. `pip install openvino --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly`), versions might be incompatible due to different ABI and running OpenVINO GenAI can result in errors (e.g. `ImportError: libopenvino.so.2430: cannot open shared object file: No such file or directory`). 
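+To check which versions are actually installed (assuming a pip-based environment), query pip directly; the reported versions must satisfy the compatibility rule described in the next paragraph:
+
+```sh
+python -m pip show openvino openvino-genai openvino-tokenizers
+```
+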
+Having packages version in format `...`, only `` part of the full version can be varied to ensure ABI compatibility, while changing ``, `` or `` parts of the version might break ABI. + +GenAI, Tokenizers, and OpenVINO wheels for Linux on PyPI are compiled with `_GLIBCXX_USE_CXX11_ABI=0` to cover a wider range of platforms. In contrast, C++ archive distributions for Ubuntu are compiled with `_GLIBCXX_USE_CXX11_ABI=1`. It is not possible to mix different Application Binary Interfaces (ABIs) because doing so results in a link error. This incompatibility prevents the use of, for example, OpenVINO from C++ archive distributions alongside GenAI from PyPI. + +If you want to try OpenVINO GenAI with different dependencies versions (**not** prebuilt packages as archives or python wheels), build OpenVINO GenAI library from source. ## Usage @@ -16,16 +30,16 @@ To build OpenVINO™ GenAI library from source, refer to the [Build Instructions 1. Installed OpenVINO™ GenAI - > If OpenVINO GenAI is installed via archive distribution or built from source, you will need to install additional python dependencies (e.g. `optimum-cli` for simplified model downloading and exporting, it's not required to install [./samples/requirements.txt](./samples/requirements.txt) for deployment if the model has already been exported): - > - > ```sh - > # (Optional) Clone OpenVINO GenAI repository if it does not exist - > git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git - > cd openvino.genai - > # Install python dependencies - > python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - > python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - > ``` + > To use OpenVINO GenAI with models that are already in OpenVINO format, no additional python dependencies are needed. To + > convert models with optimum-cli and to run the examples, install the dependencies in [./samples/requirements.txt](./samples/requirements.txt): + ```sh + # (Optional) Clone OpenVINO GenAI repository if it does not exist + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai + # Install python dependencies + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt + ``` 2. A model in OpenVINO IR format @@ -42,7 +56,7 @@ A simple example: ```python import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path, "CPU") -print(pipe.generate("The Sun is yellow because")) +print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) ``` Calling generate with custom generation config parameters, e.g. config for grouped beam search: @@ -50,7 +64,7 @@ Calling generate with custom generation config parameters, e.g. 
config for group import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path, "CPU") -result = pipe.generate("The Sun is yellow because", max_new_tokens=30, num_beam_groups=3, num_beams=15, diversity_penalty=1.5) +result = pipe.generate("The Sun is yellow because", max_new_tokens=100, num_beam_groups=3, num_beams=15, diversity_penalty=1.5) print(result) ``` @@ -73,7 +87,7 @@ while True:     prompt = input() if prompt == 'Stop!':         break -    print(pipe(prompt)) +    print(pipe(prompt, max_new_tokens=200)) pipe.finish_chat() ``` @@ -89,7 +103,7 @@ A simple example: int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::genai::LLMPipeline pipe(model_path, "CPU"); - std::cout << pipe.generate("The Sun is yellow because"); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(256)); } ``` @@ -159,7 +173,7 @@ int main(int argc, char* argv[]) { // false means continue generation. return false; }; - std::cout << pipe.generate("The Sun is yellow bacause", ov::genai::streamer(streamer)); + std::cout << pipe.generate("The Sun is yellow bacause", ov::genai::streamer(streamer), ov::genai::max_new_tokens(200)); } ``` @@ -192,14 +206,105 @@ int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::genai::LLMPipeline pipe(model_path, "CPU"); - std::cout << pipe.generate("The Sun is yellow because", ov::genai::streamer(custom_streamer)); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::streamer(custom_streamer), ov::genai::max_new_tokens(200)); } ``` +### Performance Metrics + +`openvino_genai.PerfMetrics` (referred as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. `PerfMetrics` holds fields with mean and standard deviations for the following metrics: +- Time To the First Token (TTFT), ms +- Time per Output Token (TPOT), ms/token +- Generate total duration, ms +- Tokenization duration, ms +- Detokenization duration, ms +- Throughput, tokens/s + +and: +- Load time, ms +- Number of generated tokens +- Number of tokens in the input prompt + +Performance metrics are stored either in the `DecodedResults` or `EncodedResults` `perf_metric` field. Additionally to the fields mentioned above, `PerfMetrics` has a member `raw_metrics` of type `openvino_genai.RawPerfMetrics` (referred to as `RawPerfMetrics` for simplicity) that contains raw values for the durations of each batch of new token generation, tokenization durations, detokenization durations, and more. These raw metrics are accessible if you wish to calculate your own statistical values such as median or percentiles. However, since mean and standard deviation values are usually sufficient, we will focus on `PerfMetrics`. 
+
+```python
+import openvino_genai as ov_genai
+pipe = ov_genai.LLMPipeline(model_path, "CPU")
+result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
+perf_metrics = result.perf_metrics
+
+print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f}')
+print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms')
+print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token')
+print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
+```
+
+```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
+
+int main(int argc, char* argv[]) {
+    std::string model_path = argv[1];
+    ov::genai::LLMPipeline pipe(model_path, "CPU");
+    auto result = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20));
+    auto perf_metrics = result.perf_metrics;
+
+    std::cout << std::fixed << std::setprecision(2);
+    std::cout << "Generate duration: " << perf_metrics.get_generate_duration().mean << " ms" << std::endl;
+    std::cout << "TTFT: " << perf_metrics.get_ttft().mean << " ms" << std::endl;
+    std::cout << "TPOT: " << perf_metrics.get_tpot().mean << " ms/token " << std::endl;
+    std::cout << "Throughput: " << perf_metrics.get_throughput().mean << " tokens/s" << std::endl;
+}
+```
+output:
+```sh
+Generate duration: 76.28 ms
+TTFT: 42.58 ms
+TPOT: 3.80 ms/token
+```
+
+>**Note**: If the input prompt is just a string, the generate function returns only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs.
+
+Several `perf_metrics` objects can be added to each other. In that case `raw_metrics` are concatenated and mean/std values are recalculated. This accumulates statistics from several `generate()` calls.
+
+```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
+
+int main(int argc, char* argv[]) {
+    std::string model_path = argv[1];
+    ov::genai::LLMPipeline pipe(model_path, "CPU");
+    auto result_1 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20));
+    auto result_2 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20));
+    auto perf_metrics = result_1.perf_metrics + result_2.perf_metrics;
+
+    std::cout << std::fixed << std::setprecision(2);
+    std::cout << "Generate duration: " << perf_metrics.get_generate_duration().mean << " ms" << std::endl;
+    std::cout << "TTFT: " << perf_metrics.get_ttft().mean << " ms" << std::endl;
+    std::cout << "TPOT: " << perf_metrics.get_tpot().mean << " ms/token " << std::endl;
+    std::cout << "Throughput: " << perf_metrics.get_throughput().mean << " tokens/s" << std::endl;
+}
+```
+
+```python
+import openvino_genai as ov_genai
+pipe = ov_genai.LLMPipeline(model_path, "CPU")
+res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
+res_2 = pipe.generate(["Why Sky is blue because"], max_new_tokens=20)
+perf_metrics = res_1.perf_metrics + res_2.perf_metrics
+
+print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f}')
+print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms')
+print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token')
+print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
+```
+
+For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/benchmark_genai/README.md) samples.
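+
+If mean and standard deviation are not enough, the `raw_metrics` field can be used to derive custom statistics. The snippet below is a minimal sketch (not part of the shipped samples) that computes the median per-token latency from the `m_new_token_times` timestamps declared in `RawPerfMetrics`; it assumes the pipeline has populated these timestamps during generation:
+
+```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+int main(int argc, char* argv[]) {
+    std::string model_path = argv[1];
+    ov::genai::LLMPipeline pipe(model_path, "CPU");
+    auto result = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20));
+    const auto& raw = result.perf_metrics.raw_metrics;
+
+    // Gaps between consecutive token timestamps, converted to milliseconds
+    // via the PerfMetrics::get_microsec helper.
+    std::vector<float> latencies_ms;
+    for (size_t i = 1; i < raw.m_new_token_times.size(); ++i) {
+        auto gap = raw.m_new_token_times[i] - raw.m_new_token_times[i - 1];
+        latencies_ms.push_back(ov::genai::PerfMetrics::get_microsec(gap) / 1000.0f);
+    }
+
+    // Median as an example of a statistic that PerfMetrics does not cache itself.
+    std::sort(latencies_ms.begin(), latencies_ms.end());
+    if (!latencies_ms.empty()) {
+        std::cout << "Median per-token latency: " << latencies_ms[latencies_ms.size() / 2] << " ms" << std::endl;
+    }
+}
+```
+The other vectors declared in `RawPerfMetrics` (tokenization and detokenization durations, batch sizes) can be processed the same way.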
+ ## How It Works -For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/HOW_IT_WORKS.md). +For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](./docs/HOW_IT_WORKS.md). ## Supported Models -For a list of supported models, refer to the [Supported Models Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/SUPPORTED_MODELS.md). +For a list of supported models, refer to the [Supported Models Section](./docs/SUPPORTED_MODELS.md). diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index f5f8c53309..626a51c5da 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -10,6 +10,8 @@ #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/generation_handle.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/streamer_base.hpp" #include "openvino/genai/visibility.hpp" namespace ov::genai { @@ -56,13 +58,27 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { PipelineMetrics get_metrics() const; - GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params); + GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params); + GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params); void step(); bool has_non_finished_requests(); // more high level interface, which can process multiple prompts in continuous batching manner - std::vector generate(const std::vector& prompts, std::vector sampling_params); + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); + std::vector generate(const std::vector& prompts, const std::vector& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); + + /** + * @brief start chat with keeping history in kv cache. + * + * @param system_message optional system message. + */ + void start_chat(const std::string& system_message = ""); + + /** + * @brief finish chat and clear kv cache. + */ + void finish_chat(); }; } diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index d0ddbc3a32..8d00ae0e9b 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -18,6 +18,20 @@ enum class GenerationStatus { DROPPED_BY_HANDLE = 4 // Status set when generation handle is dropped }; +struct EncodedGenerationResult { + // request ID - obsolete when handle API is approved as handle will connect results with prompts. + uint64_t m_request_id; + + // in a generic case we have multiple generation results per initial prompt + // depending on sampling parameters (e.g. beam search or parallel sampling) + std::vector> m_generation_ids; + // scores + std::vector m_scores; + + // Status of generation + GenerationStatus m_status = GenerationStatus::RUNNING; +}; + struct GenerationResult { // request ID - obsolete when handle API is approved as handle will connect results with prompts. 
uint64_t m_request_id; @@ -60,6 +74,7 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { bool can_read(); + GenerationOutputs back(); // Reads result of a generation for single iteration GenerationOutputs read(); // Reads all generated tokens for all sequences diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 84dc02bd58..4be298128e 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -5,11 +5,13 @@ #include #include +#include #include "openvino/core/any.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/perf_metrics.hpp" namespace ov { namespace genai { @@ -29,11 +31,13 @@ using StringInputs = std::variant>; * * @param tokens sequence of resulting tokens * @param scores sum of logarithmic probabilities of all tokens in the sequence +* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics */ class EncodedResults { public: std::vector> tokens; std::vector scores; + PerfMetrics perf_metrics; }; /** @@ -42,11 +46,13 @@ class EncodedResults { * * @param texts vector of resulting sequences * @param scores scores for each sequence +* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics */ class DecodedResults { public: std::vector texts; std::vector scores; + PerfMetrics perf_metrics; // @brief Convert DecodedResults to a string. operator std::string() const { diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp new file mode 100644 index 0000000000..ad53d8d941 --- /dev/null +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -0,0 +1,149 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "openvino/genai/visibility.hpp" +#include +#include +#include + +namespace ov { +namespace genai { + +using TimePoint = std::chrono::steady_clock::time_point; +using MicroSeconds = std::chrono::duration>; + +/** + * @brief Structure with raw performance metrics for each generation before any statistics are calculated. + * + * @param generate_durations Durations for each generate call in microseconds. + * @param tokenization_durations Durations for the tokenization process in microseconds. + * @param detokenization_durations Durations for the detokenization process in microseconds. + * @param m_times_to_first_token Times to the first token for each call in microseconds. + * @param m_new_token_times Time points for each new token generated. + * @param m_batch_sizes Batch sizes for each generate call. + * @param m_durations Total durations for each generate call in microseconds. + * @param num_generated_tokens Total number of tokens generated. + * @param num_input_tokens Total number of tokens in the input prompt. + */ +struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { + std::vector generate_durations; + std::vector tokenization_durations; + std::vector detokenization_durations; + + std::vector m_times_to_first_token; + std::vector m_new_token_times; + std::vector m_batch_sizes; + std::vector m_durations; + + size_t num_generated_tokens; + size_t num_input_tokens; +}; + +/** +* @brief Structure to store mean and standart deviation values. 
+*/
+struct OPENVINO_GENAI_EXPORTS MeanStdPair {
+    float mean;
+    float std;
+};
+
+/**
+ * @brief Holds performance metrics for each generate call.
+ *
+ * PerfMetrics holds fields with mean and standard deviations for the following metrics:
+ * - Time To the First Token (TTFT), ms
+ * - Time per Output Token (TPOT), ms/token
+ * - Generate total duration, ms
+ * - Tokenization duration, ms
+ * - Detokenization duration, ms
+ * - Throughput, tokens/s
+ *
+ * Additional fields include:
+ * - Load time, ms
+ * - Number of generated tokens
+ * - Number of tokens in the input prompt
+ *
+ * The preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs.
+ * If mean and std were already calculated, getters return cached values.
+ * @param get_load_time Returns the load time in milliseconds.
+ * @param get_num_generated_tokens Returns the number of generated tokens.
+ * @param get_num_input_tokens Returns the number of tokens in the input prompt.
+ * @param get_ttft Returns the mean and standard deviation of TTFT.
+ * @param get_tpot Returns the mean and standard deviation of TPOT.
+ * @param get_throughput Returns the mean and standard deviation of throughput.
+ * @param get_generate_duration Returns the mean and standard deviation of generate duration.
+ * @param get_tokenization_duration Returns the mean and standard deviation of tokenization duration.
+ * @param get_detokenization_duration Returns the mean and standard deviation of detokenization duration.
+ * @param get_microsec Converts a duration to microseconds.
+ * @param m_evaluated Flag indicating if raw metrics were evaluated.
+ * If false, current mean/std TTFT, TPOT, etc. are not actual and evaluate_statistics() should recalculate them.
+ * @param evaluate_statistics Calculates mean and standard deviation values from raw_metrics.
+ * Optional start_time can be provided to update durations.
+ * @param operator+ Adds two PerfMetrics objects.
+ * @param operator+= Adds and assigns the right-hand PerfMetrics to the current object.
+ * @param raw_metrics A structure of RawPerfMetrics type that holds raw metrics.
+ * @param load_time Load time in milliseconds.
+ *
+ * Cached mean and standard deviations.
+ * @param ttft Mean and standard deviation of Time to the First Token (TTFT) in milliseconds.
+ * @param tpot Mean and standard deviation of Time per Output Token (TPOT) in milliseconds per token.
+ * @param throughput Mean and standard deviation of tokens per second.
+ * @param generate_duration Mean and standard deviation of the total duration of generate calls in milliseconds.
+ * @param tokenization_duration Mean and standard deviation of the tokenization duration in milliseconds.
+ * @param detokenization_duration Mean and standard deviation of the detokenization duration in milliseconds.
+ * @param num_generated_tokens Number of generated tokens.
+ * @param num_input_tokens Number of tokens in the input prompt.
+ */
+struct OPENVINO_GENAI_EXPORTS PerfMetrics {
+    float load_time;         // Load time in ms.
+    MeanStdPair ttft;        // Time to the first token (in ms) (TTFT).
+    MeanStdPair tpot;        // Time (in ms) per output token (TPOT).
+    MeanStdPair throughput;  // Tokens per second.
+
+    MeanStdPair generate_duration;
+    MeanStdPair tokenization_duration = {-1, -1};
+    MeanStdPair detokenization_duration = {-1, -1};
+
+    size_t num_generated_tokens;
+    size_t num_input_tokens;
+
+    float get_load_time();   // Load time in ms.
+ float get_num_generated_tokens(); + float get_num_input_tokens(); + MeanStdPair get_ttft(); // Time to the first token (in ms) (TTTFT). + MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT). + MeanStdPair get_throughput(); // Tokens per second. + + MeanStdPair get_generate_duration(); + MeanStdPair get_tokenization_duration(); + MeanStdPair get_detokenization_duration(); + + // Flag indicating if raw metrics were evaluated. + // If false means current mean/std ttft, tpot, etc. are not actual + // and evaluate_statistics() should recalculate them. + bool m_evaluated = false; + + /** + * @brief calculates mean/std values from raw_metrics. + * + * @param start_time optional start_time in case if duration needs to be updated. + */ + void evaluate_statistics(std::optional start_time = std::nullopt); + + /** + * @brief convert duration to microseconds + * + * @param duration duration in + */ + static float get_microsec(std::chrono::steady_clock::duration duration); + PerfMetrics operator+(const PerfMetrics& metrics) const; + PerfMetrics& operator+=(const PerfMetrics& right); + + RawPerfMetrics raw_metrics; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/scheduler_config.hpp b/src/cpp/include/openvino/genai/scheduler_config.hpp index d9bf7a7b41..aca823fa63 100644 --- a/src/cpp/include/openvino/genai/scheduler_config.hpp +++ b/src/cpp/include/openvino/genai/scheduler_config.hpp @@ -16,7 +16,7 @@ struct SchedulerConfig { std::size_t num_kv_blocks = 0; // total size of KV cache in GB - std::size_t cache_size = 0; + std::size_t cache_size = 1; // block size for KV cache std::size_t block_size = 32; diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp index 3e80217f14..d9815610c5 100644 --- a/src/cpp/src/block_manager.hpp +++ b/src/cpp/src/block_manager.hpp @@ -558,7 +558,7 @@ class BlockManager { block->set_timestamp(std::chrono::system_clock::now()); m_block_table[seq_id].push_back(block); group->update_processed_tokens_num(prev_iteration_content_len + i); - + size_t new_tokens_count_in_block = std::min(content_len, prev_iteration_content_len + block_size); if (new_tokens_count_in_block > prev_iteration_content_len + i) { cached_blocks.erase(hash); diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 55100f3cb4..a66a88cad4 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -6,16 +6,21 @@ #include #include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_handle.hpp" #include "openvino/genai/tokenizer.hpp" #include "cache_manager.hpp" #include "sampler.hpp" #include "model_runner.hpp" #include "scheduler.hpp" +#include "text_callback_streamer.hpp" #include "timer.hpp" #include "debug_utils.hpp" using namespace ov::genai; +template struct overloaded : Ts... {using Ts::operator()...;}; +template overloaded(Ts...) 
-> overloaded; + void apply_paged_attention_transformations(std::shared_ptr model, DeviceConfig& device_config); class ContinuousBatchingPipeline::Impl { @@ -51,6 +56,8 @@ class ContinuousBatchingPipeline::Impl { std::vector m_awaiting_requests; // Mutex protecting access to m_awaiting_requests, so add_request and step methods can be called from different threads std::mutex m_awaiting_requests_mutex; + bool m_is_chat_conversation = false; + ChatHistory m_history; void _free_non_running_requests() { @@ -120,18 +127,9 @@ class ContinuousBatchingPipeline::Impl { return m_tokenizer; } - GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { + GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, ov::genai::GenerationConfig sampling_params) { sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id()); sampling_params.validate(); - - ov::Tensor input_ids; - { - static ManualTimer timer("tokenize"); - timer.start(); - input_ids = m_tokenizer.encode(prompt).input_ids; - timer.end(); - } - SequenceGroup::Ptr sequence_group = std::make_shared(request_id, input_ids, sampling_params, m_scheduler->get_config().block_size); { @@ -141,6 +139,14 @@ class ContinuousBatchingPipeline::Impl { return std::make_unique(sequence_group->get_generation_stream(), sampling_params); } + GenerationHandle add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params) { + static ManualTimer timer("tokenize"); + timer.start(); + ov::Tensor input_ids = m_tokenizer.encode(prompt).input_ids; + timer.end(); + return add_request(request_id, input_ids, sampling_params); + } + void step() { static ManualTimer step_timer("step()"); step_timer.start(); @@ -238,25 +244,47 @@ class ContinuousBatchingPipeline::Impl { return !m_awaiting_requests.empty() || !m_requests.empty(); } - std::vector generate(const std::vector prompts, std::vector sampling_params) { + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const StreamerVariant& streamer) { OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. 
Use ContinuousBatchingPipeline::add_request"); - OPENVINO_ASSERT(prompts.size() == sampling_params.size()); + OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); + const std::shared_ptr& streamer_ptr = std::visit(overloaded{ + [](std::monostate) -> std::shared_ptr { + return nullptr; + }, + [](const std::shared_ptr& streamer) { + return streamer; + }, + [this](const std::function& streamer) -> std::shared_ptr { + return std::make_unique(m_tokenizer, streamer); + } + }, streamer); std::vector generations; - for (size_t request_id = 0; request_id < prompts.size(); ++request_id) { - generations.push_back(add_request(request_id, prompts[request_id], sampling_params[request_id])); + for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { + OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); + generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id])); } - std::vector results; + std::vector results; results.reserve(m_awaiting_requests.size()); - while (has_non_finished_requests()) { + bool continue_generation = true; + while (has_non_finished_requests() && continue_generation) { step(); + if (streamer_ptr) { + std::unordered_map token = generations.at(0).get()->back(); + OPENVINO_ASSERT(1 == token.size()); + OPENVINO_ASSERT(1 == token.begin()->second.generated_token_ids.size()); + continue_generation = !streamer_ptr->put(token.begin()->second.generated_token_ids.at(0)); + } + } + if (streamer_ptr) { + streamer_ptr->end(); } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { const auto& generation = generations[generation_idx]; - GenerationResult result; + EncodedGenerationResult result; result.m_request_id = 1; std::vector generation_outputs = generation->read_all(); std::sort(generation_outputs.begin(), generation_outputs.end(), [=] (GenerationOutput& r1, GenerationOutput& r2) { @@ -266,17 +294,69 @@ class ContinuousBatchingPipeline::Impl { auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { const auto& generation_output = generation_outputs[generation_output_idx]; - std::string output_text = m_tokenizer.decode(generation_output.generated_token_ids); - result.m_generation_ids.push_back(output_text); + result.m_generation_ids.push_back(std::move(generation_output.generated_token_ids)); result.m_scores.push_back(generation_output.score); } result.m_status = generation->get_status(); - results.push_back(result); + results.push_back(std::move(result)); } - OPENVINO_ASSERT(results.size() == prompts.size()); + OPENVINO_ASSERT(results.size() == input_ids.size()); return results; } + + std::vector generate(const std::vector& prompts, std::vector sampling_params, const StreamerVariant& streamer) { + std::vector input_ids; + static ManualTimer timer("tokenize"); + if (m_is_chat_conversation) { + OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts"); + m_history.push_back({{"role", "user"}, {"content", prompts.at(0)}}); + constexpr bool add_generation_prompt = true; + std::string history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + timer.start(); + input_ids.push_back(m_tokenizer.encode(history).input_ids); + timer.end(); + } else { + input_ids.reserve(prompts.size()); + for (const std::string& prompt : prompts) { + timer.start(); + 
input_ids.push_back(m_tokenizer.encode(prompt).input_ids); + timer.end(); + } + } + std::vector encoded = generate(input_ids, sampling_params, streamer); + std::vector decoded; + decoded.reserve(encoded.size()); + for (EncodedGenerationResult& res : encoded) { + std::vector generated; + generated.reserve(res.m_generation_ids.size()); + for (size_t idx = 0; idx < res.m_generation_ids.size(); ++idx) { + generated.push_back(m_tokenizer.decode(res.m_generation_ids.at(idx))); + if (m_is_chat_conversation && 0 == idx) { + m_history.push_back({{"role", "assistant"}, {"content", generated.back()}}); + } + } + decoded.push_back(GenerationResult{ + res.m_request_id, + std::move(generated), + std::move(res.m_scores), + res.m_status + }); + } + return decoded; + } + + void start_chat(const std::string& system_message) { + if (!system_message.empty()) { + m_history.push_back({{"role", "system"}, {"content", system_message}}); + } + m_is_chat_conversation = true; + }; + + void finish_chat() { + m_is_chat_conversation = false; + m_history.clear(); + }; }; ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& models_path, @@ -307,10 +387,14 @@ PipelineMetrics ContinuousBatchingPipeline::get_metrics() const{ return m_impl->get_metrics(); } -GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params) { return m_impl->add_request(request_id, prompt, sampling_params); } +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params) { + return m_impl->add_request(request_id, input_ids, sampling_params); +} + void ContinuousBatchingPipeline::step() { m_impl->step(); } @@ -319,6 +403,18 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() { return m_impl->has_non_finished_requests(); } -std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, std::vector sampling_params) { - return m_impl->generate(prompts, sampling_params); -} \ No newline at end of file +std::vector ContinuousBatchingPipeline::generate(const std::vector& input_ids, const std::vector& sampling_params, const StreamerVariant& streamer) { + return m_impl->generate(input_ids, sampling_params, streamer); +} + +std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, const std::vector& sampling_params, const StreamerVariant& streamer) { + return m_impl->generate(prompts, sampling_params, streamer); +} + +void ContinuousBatchingPipeline::start_chat(const std::string& system_message) { + m_impl->start_chat(system_message); +}; + +void ContinuousBatchingPipeline::finish_chat() { + m_impl->finish_chat(); +}; diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp index a0187025ec..26cc12604f 100644 --- a/src/cpp/src/generation_handle.cpp +++ b/src/cpp/src/generation_handle.cpp @@ -20,6 +20,10 @@ bool GenerationHandleImpl::can_read() { return m_generation_stream->can_read(); } +std::unordered_map GenerationHandleImpl::back() { + return m_generation_stream->back(); +} + std::unordered_map GenerationHandleImpl::read() { return m_generation_stream->read(); } diff --git a/src/cpp/src/generation_stream.hpp b/src/cpp/src/generation_stream.hpp index 0d51897e82..1ac2eefef9 100644 --- a/src/cpp/src/generation_stream.hpp +++ 
b/src/cpp/src/generation_stream.hpp @@ -31,6 +31,9 @@ class GenerationStream { } // Retriving vector of pairs as we can generate multiple outputs for a single prompt + GenerationOutputs back() { + return m_output_queue.back(); + } GenerationOutputs read() { return m_output_queue.pull(); } diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 9170c7d2f9..8dc56b4ba8 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/perf_metrics.hpp" #include "utils.hpp" namespace ov { @@ -19,12 +19,16 @@ EncodedResults greedy_decoding( const size_t batch_size = prompts_shape[0]; size_t running_batch_size = batch_size; size_t prompt_len = prompts_shape[1]; + size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); + // Initialize results and performance metrics. EncodedResults results; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; + results.scores.resize(running_batch_size); results.tokens.resize(running_batch_size); std::fill(results.scores.begin(), results.scores.end(), 0); - + m_model_runner.set_tensor("input_ids", input_ids); m_model_runner.set_tensor("attention_mask", attention_mask); if (position_ids.has_value()) @@ -50,6 +54,9 @@ EncodedResults greedy_decoding( eos_met[batch] = (out_token == generation_config.eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); + if (streamer && streamer->put(token_iter_results[0])) { return results; } @@ -58,8 +65,8 @@ EncodedResults greedy_decoding( if (!generation_config.ignore_eos && all_are_eos) return results; - size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); - for (size_t i = 0; i < max_tokens - 1; ++i) { + + for (size_t i = 0; i < max_new_tokens - 1; ++i) { if (position_ids.has_value()) utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); @@ -80,6 +87,8 @@ EncodedResults greedy_decoding( m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); if (streamer && streamer->put(token_iter_results[0])) return results; @@ -106,8 +115,9 @@ EncodedResults greedy_decoding( if (streamer) { streamer->end(); } + return results; } } //namespace genai -} //namespace ov \ No newline at end of file +} //namespace ov diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 8695aeac02..1b9729b2f6 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -362,14 +362,15 @@ std::pair beam_search(ov::InferRequest& lm, std::optional selected_beam_idx) { OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); - - // Initialize beam search + auto batch_size = input_ids.get_shape().at(0); + auto sequence_length = input_ids.get_shape().at(1); + + // Initialize beam search. 
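+    // Each batch element's tokens are copied out of the flattened input_ids tensor below,
+    // so the beam searcher can track one prompt per batch element.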
const int64_t* prompt_data = input_ids.data(); std::vector> prompts; prompts.reserve(batch_size); for (size_t batch = 0; batch < batch_size; batch++) { - size_t sequence_length = input_ids.get_shape().at(1); size_t batch_offset = batch * sequence_length; const int64_t* prompt_start = prompt_data + batch_offset; prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); @@ -389,7 +390,7 @@ std::pair beam_search(ov::InferRequest& lm, lm.set_tensor("beam_idx", beam_idx); Parameters parameters{std::move(prompts)}; - parameters.max_new_tokens = config.max_new_tokens; + parameters.max_new_tokens = config.get_max_new_tokens(sequence_length); parameters.eos_token_id = config.eos_token_id; parameters.n_groups = config.num_beam_groups; parameters.group_size = config.num_beams / config.num_beam_groups; @@ -401,11 +402,20 @@ std::pair beam_search(ov::InferRequest& lm, std::vector next_tokens; std::vector next_beams; - + + // Reserve for performance counters. + std::vector new_token_times; + std::vector batch_sizes; + new_token_times.reserve(parameters.max_new_tokens); + batch_sizes.reserve(parameters.max_new_tokens); + for (size_t length_count = 0; ; ++length_count) { lm.infer(); std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); + new_token_times.emplace_back(std::chrono::steady_clock::now()); + batch_sizes.emplace_back(batch_size); + if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) { // Break the cycle before masks are extended in update_attention_mask_with_beams. // If generation is continued, attention_mask length should be equal to KV cache size. @@ -434,6 +444,9 @@ std::pair beam_search(ov::InferRequest& lm, int32_t res_selected_beam_idx = 0; results.scores.reserve(config.num_return_sequences * result.size()); results.tokens.reserve(config.num_return_sequences * result.size()); + auto& raw_perf_counters = results.perf_metrics.raw_metrics; + raw_perf_counters.m_new_token_times = new_token_times; + raw_perf_counters.m_batch_sizes = batch_sizes; // align output with HF for (size_t prompt_id = 0; prompt_id < result.size(); prompt_id++) { @@ -462,7 +475,7 @@ std::pair beam_search(ov::InferRequest& lm, results.tokens.push_back(std::move(beam->get().tokens)); } } - + return {results, res_selected_beam_idx}; } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 1594dbd583..b121fe9e6d 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -7,8 +7,10 @@ #include #include #include +#include "openvino/genai/continuous_batching_pipeline.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/perf_metrics.hpp" #include "llm_pipeline_base.hpp" #include "llm_pipeline_static.hpp" #include "utils.hpp" @@ -110,10 +112,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { + auto start_time = std::chrono::steady_clock::now(); GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; - EncodedInputs encoded_input; + TokenizedInputs encoded_input; if (auto input_vector = std::get_if>(&inputs)) { + OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts"); encoded_input = m_tokenizer.encode(*input_vector); } else if (auto input_prompt = std::get_if(&inputs)) { std::string& prompt = *input_prompt; @@ -143,9 +147,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { encoded_input = m_tokenizer.encode(prompt); } } + auto encode_stop_time = std::chrono::steady_clock::now(); + auto encoded_results = generate(encoded_input, config, streamer); - auto encoded_results = generate(encoded_input, config, streamer); + auto decode_start_time = std::chrono::steady_clock::now(); DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + auto decode_stop_time = std::chrono::steady_clock::now(); if (is_chat_conversation) { // Tail of chat template is missing in KV cache. @@ -155,6 +162,17 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { m_history.push_back({{"role", "assistant"}, {"content", answer}}); } + // generate_durations + decoded_results.perf_metrics = encoded_results.perf_metrics; + + auto& raw_counters = decoded_results.perf_metrics.raw_metrics; + auto stop_time = std::chrono::steady_clock::now(); + raw_counters.generate_durations = std::vector(); + raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time)); + raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time)); + + decoded_results.perf_metrics.evaluate_statistics(start_time); return decoded_results; } @@ -163,9 +181,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { + auto start_time = std::chrono::steady_clock::now(); ov::Tensor input_ids; ov::Tensor attention_mask; - if (auto data = std::get_if(&inputs)) { input_ids = *data; attention_mask = ov::genai::utils::init_attention_mask(input_ids); @@ -253,7 +271,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else { m_is_cache_empty = false; } - + auto stop_time = std::chrono::steady_clock::now(); + + // If is called without tokenization then that stat will not be reported. + auto& metrics = result.perf_metrics; + metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); + metrics.load_time = this->m_load_time_ms; + metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + metrics.evaluate_statistics(start_time); return result; } @@ -335,14 +360,161 @@ std::pair generation_config(const GenerationConfig& config) { } // namespace genai } // namespace ov -using namespace std; +namespace { +using namespace ov::genai; + +template struct overloaded : Ts... {using Ts::operator()...;}; +template overloaded(Ts...) 
-> overloaded; + +Tokenizer dont_construct() { + OPENVINO_THROW("Continuous Batching backend can't be constructed" + "from ireq because the model must be transformed"); +} + +class ContinuousBatchingAdapter final : public LLMPipelineImplBase { +public: + ContinuousBatchingPipeline m_impl; + + ContinuousBatchingAdapter( + const ov::InferRequest& request, + const Tokenizer& tokenizer, + OptionalGenerationConfig generation_config + ): LLMPipelineImplBase{dont_construct()}, m_impl{"", {}} {} + + ContinuousBatchingAdapter( + const std::filesystem::path& model_path, + const Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config + ): LLMPipelineImplBase{tokenizer}, m_impl{ + model_path.string(), + tokenizer, + SchedulerConfig{}, + device, + plugin_config + } {} + + ContinuousBatchingAdapter( + const std::filesystem::path& model_path, + const std::string& device, + const ov::AnyMap& plugin_config + ): LLMPipelineImplBase{Tokenizer(model_path.string())}, m_impl{ + model_path.string(), + m_tokenizer, + SchedulerConfig{}, + device, + plugin_config + } {} + + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + std::vector prompts = std::visit(overloaded{ + [](const std::string& prompt) { + return std::vector{prompt}; + }, + [](std::vector& prompts) { + return prompts; + } + }, inputs); + const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; + // -1 == config.eos_token_id and config.validate() are handled in m_impl. + std::vector generated = m_impl.generate( + prompts, + std::vector{prompts.size(), config}, + streamer + ); + std::vector plain_replies; + std::vector plain_scores; + for (GenerationResult& res : generated) { + if (GenerationStatus::FINISHED != res.m_status) { + OPENVINO_THROW("Got unfinished GenerationStatus"); + } + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); + } + return {std::move(plain_replies), std::move(plain_scores)}; + } + + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + std::vector input_ids = std::visit(overloaded{ + [](const ov::Tensor& inp) { + size_t batch_size = inp.get_shape().at(0); + if (1 == batch_size) { + return std::vector{inp}; + } + std::vector input_ids; + input_ids.reserve(batch_size); + size_t max_len = inp.get_shape().at(1); + const int64_t* const source = inp.data(); + for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) { + input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len)); + int64_t* destination = input_ids.back().data(); + std::copy_n(source + batch_id * max_len, max_len, destination); + } + return input_ids; + }, + [](const TokenizedInputs& inp) { + size_t batch_size = inp.input_ids.get_shape().at(0); + std::vector input_ids; + input_ids.reserve(batch_size); + size_t max_len = inp.input_ids.get_shape().at(1); + const int64_t* const source = inp.input_ids.data(); + const int64_t* const attention_mask = inp.attention_mask.data(); + for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) { + input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len)); + int64_t* destination = input_ids.back().data(); + size_t copy_count = 0; + for (size_t idx = 0; idx < max_len; ++idx) { + if (1 == attention_mask[batch_id * max_len + 
idx]) { + destination[copy_count++] = source[batch_id * max_len + idx]; + } + } + input_ids.back().set_shape({1, copy_count}); + } + return input_ids; + } + }, inputs); + const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; + // -1 == config.eos_token_id and config.validate() are handled in m_impl. + std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}, streamer); + std::vector> plain_tokens; + std::vector plain_scores; + for (EncodedGenerationResult& res : generated) { + if (GenerationStatus::FINISHED != res.m_status) { + OPENVINO_THROW("Got unfinished GenerationStatus"); + } + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); + } + return {std::move(plain_tokens), std::move(plain_scores)}; + } + + void start_chat(const std::string& system_message) override { + m_impl.start_chat(); + }; + + void finish_chat() override { + m_impl.finish_chat(); + }; +}; +} ov::genai::LLMPipeline::LLMPipeline( const ov::InferRequest& request, const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config ) { + auto start_time = std::chrono::steady_clock::now(); m_pimpl = std::make_unique(request, tokenizer, generation_config); + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::LLMPipeline::LLMPipeline( @@ -350,24 +522,34 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config -) { - if (device == "NPU") { - m_pimpl = make_unique(std::filesystem::path(model_path), tokenizer, device, plugin_config); +){ + auto start_time = std::chrono::steady_clock::now(); + if ("CB" == device) { + m_pimpl = std::make_unique(model_path, tokenizer, "CPU", plugin_config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); } else { - m_pimpl = make_unique(std::filesystem::path(model_path), tokenizer, device, plugin_config); + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); } + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::LLMPipeline::LLMPipeline( const std::string& path, const std::string& device, const ov::AnyMap& config -) { - if (device == "NPU") { - m_pimpl = make_unique(std::filesystem::path(path), device, config); +){ + auto start_time = std::chrono::steady_clock::now(); + if ("CB" == device) { + m_pimpl = std::make_unique(path, "CPU", config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique(path, device, config); } else { - m_pimpl = make_unique(std::filesystem::path(path), device, config); + m_pimpl = std::make_unique(path, device, config); } + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { @@ -387,7 +569,7 @@ void ov::genai::LLMPipeline::finish_chat() { } void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) { - int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id;; + int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id; m_pimpl->m_generation_config = 
config; // if eos_token_id was not provided in config forward from default config if (config.eos_token_id == -1) diff --git a/src/cpp/src/llm_pipeline_base.hpp b/src/cpp/src/llm_pipeline_base.hpp index 9df6442b35..7e58cd3b37 100644 --- a/src/cpp/src/llm_pipeline_base.hpp +++ b/src/cpp/src/llm_pipeline_base.hpp @@ -36,6 +36,8 @@ class LLMPipelineImplBase { Tokenizer m_tokenizer; GenerationConfig m_generation_config; + + float m_load_time_ms = 0; }; } // namespace genai diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 3f50d30ec9..d05d928df6 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -12,6 +12,23 @@ namespace { +void align_u4_zp_constants(const std::shared_ptr& model) { + for (auto op : model->get_ops()) { + if (ov::op::util::is_constant(op)) { + auto cst_op = std::dynamic_pointer_cast(op); + const auto cst_op_out = cst_op->output(0); + if (cst_op_out.get_element_type() == ov::element::u4 && ov::shape_size(cst_op_out.get_shape()) == 1u) { + ov::Tensor cst_tensor(ov::element::u4, cst_op_out.get_shape()); + *static_cast(cst_tensor.data()) = cst_op->get_vector()[0] & 0x0f; + auto new_cst_op = std::make_shared(cst_tensor); + for (auto target_input : cst_op_out.get_target_inputs()) { + target_input.replace_source_output(new_cst_op); + } + } + } + } +} + std::shared_ptr add_slices_to_kvcache_inputs(const std::shared_ptr& model) { const auto kvcache_name_pattern = "past_key_values"; std::vector> new_params; @@ -145,22 +162,21 @@ StaticLLMPipeline::StaticLLMPipeline( ov::Core core; // (1) Read the template model - this will be kvcache model auto kvcache_model = core.read_model(path / "openvino_model.xml"); - // (2) Expose KV-cache input and output layers from kvcache model - ov::pass::StatefulToStateless().run_on_model(kvcache_model); + // (2) TODO: Expose KV-cache input and output layers from kvcache model // (3) Clone the model - this will be prefill - auto prefill_model = kvcache_model->clone(); - prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); + m_prefill_model = m_kvcache_model->clone(); + m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); // (4) Reshape both models to static shape m_kvcache_desc = KVCacheDesc { 1024u, 0u }; const uint32_t max_prompt_size = m_kvcache_desc.total_size; const uint32_t max_kvcache_size = m_kvcache_desc.total_size; - reshape_to_static(prefill_model, max_prompt_size, max_kvcache_size); - reshape_to_static(kvcache_model, 1u, max_kvcache_size); + reshape_to_static(m_prefill_model, max_prompt_size, max_kvcache_size); + reshape_to_static(m_kvcache_model, 1u, max_kvcache_size); // (5) Add slices to kvcache model - kvcache_model = add_slices_to_kvcache_inputs(kvcache_model); + m_kvcache_model = add_slices_to_kvcache_inputs(m_kvcache_model); // (6) Compile both model m_prefill_request = core.compile_model( - prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") + m_prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") ).create_infer_request(); m_kvcache_request = core.compile_model( kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 85488e1880..7560b7e336 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -46,6 +46,10 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { uint32_t num_stored_tokens; }; + 
// FIXME: Ideally, we don't need to keep those + std::shared_ptr m_kvcache_model; + std::shared_ptr m_prefill_model; + KVCacheDesc m_kvcache_desc; ov::InferRequest m_kvcache_request; ov::InferRequest m_prefill_request; diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp index fd16e948c1..b00c62aed7 100644 --- a/src/cpp/src/multinomial_decoding.cpp +++ b/src/cpp/src/multinomial_decoding.cpp @@ -162,7 +162,9 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner size_t prompt_len = prompts_shape[1]; - ov::genai::EncodedResults results; + // Initialize results and performance metrics. + EncodedResults results; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; results.scores.resize(batch_size, 0); results.tokens.resize(batch_size); @@ -179,6 +181,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner m_model_runner.get_tensor("beam_idx").data()[0] = 0; m_model_runner.infer(); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); auto logits_tensor = m_model_runner.get_tensor("logits"); @@ -222,6 +226,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner m_model_runner.get_tensor("input_ids").data()[0] = out_token.id; m_model_runner.infer(); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); logits = m_model_runner.get_tensor("logits").data(); out_token = sampling.get_out_token(logits, vocab_size, tokens); diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp new file mode 100644 index 0000000000..2f378ab302 --- /dev/null +++ b/src/cpp/src/perf_metrics.cpp @@ -0,0 +1,164 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/perf_metrics.hpp" +#include "openvino/openvino.hpp" +#include +#include +#include + +namespace { + +ov::genai::MeanStdPair calc_mean_and_std(const std::vector& durations) { + // Accepts time durations in microseconds and returns standard deviation and mean in milliseconds. 
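+    // Mean is computed over the per-entry values converted to milliseconds; the standard
+    // deviation is derived from the same values as sqrt(E[x^2] - mean^2).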
+    float mean = std::accumulate(durations.begin(), durations.end(), 0.0f,
+        [](const float& acc, const ov::genai::MicroSeconds& duration) -> float {
+            return acc + duration.count() / 1000.0f;
+        });
+    mean /= durations.size();
+
+    float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f,
+        [](const float& acc, const ov::genai::MicroSeconds& duration) -> float {
+            auto d = duration.count() / 1000.0f;
+            return acc + d * d;
+        });
+    float std = std::sqrt(sum_square_durations / durations.size() - mean * mean);
+    return {mean, std};
+}
+
+
+} // namespace
+
+namespace ov {
+namespace genai {
+
+float PerfMetrics::get_load_time() {
+    return load_time;
+}
+
+float PerfMetrics::get_num_generated_tokens() {
+    evaluate_statistics();
+    return num_generated_tokens;
+}
+
+float PerfMetrics::get_num_input_tokens() {
+    evaluate_statistics();
+    return num_input_tokens;
+}
+
+MeanStdPair PerfMetrics::get_ttft() {
+    evaluate_statistics();
+    return ttft;
+}
+
+MeanStdPair PerfMetrics::get_tpot() {
+    evaluate_statistics();
+    return tpot;
+}
+
+MeanStdPair PerfMetrics::get_throughput() {
+    evaluate_statistics();
+    return throughput;
+}
+
+MeanStdPair PerfMetrics::get_generate_duration() {
+    evaluate_statistics();
+    return generate_duration;
+}
+
+MeanStdPair PerfMetrics::get_tokenization_duration() {
+    evaluate_statistics();
+    return tokenization_duration;
+}
+
+MeanStdPair PerfMetrics::get_detokenization_duration() {
+    evaluate_statistics();
+    return detokenization_duration;
+}
+
+float PerfMetrics::get_microsec(std::chrono::steady_clock::duration duration) {
+    return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
+}
+
+void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
+    if (m_evaluated){
+        return;
+    }
+    // If start_time is specified then recalculate durations according to start times and calculate statistics only after that.
+    if (start_time.has_value()) {
+        auto start_time_val = *start_time;
+        auto& tok_times = raw_metrics.m_new_token_times;
+        auto& batch_sizes = raw_metrics.m_batch_sizes;
+        raw_metrics.m_durations = std::vector<MicroSeconds>(tok_times.size());
+
+        auto ttft = tok_times[0] - start_time_val;
+        raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
+        raw_metrics.m_times_to_first_token.emplace_back(ttft);
+        num_generated_tokens = 0;
+        for (size_t i = 0; i < tok_times.size(); ++i) {
+            raw_metrics.m_durations[i] = tok_times[i] - start_time_val;
+
+            // If in 10 ms a batch of 5 new tokens is generated then TPOT is 10 / 5 = 2 ms/token.
+            raw_metrics.m_durations[i] /= batch_sizes[i];
+            num_generated_tokens += batch_sizes[i];
+            start_time_val = tok_times[i];
+        }
+    }
+
+    // calc_mean_and_std will convert microseconds to milliseconds.
+    tpot = calc_mean_and_std(raw_metrics.m_durations);
+    ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token);
+
+    generate_duration = calc_mean_and_std(raw_metrics.generate_durations);
+    tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations);
+    detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations);
+
+    // tokens per second
+    throughput = {1000.0f / tpot.mean, (tpot.std * 1000.0f) / (tpot.mean * tpot.mean)};
+    m_evaluated = true;
+}
+
+PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const {
+    OPENVINO_ASSERT(right.load_time == load_time, "generation metrics can be accumulated only for the same pipeline");
+
+    // Copy left value to res.
+    PerfMetrics res = *this;
+
+    // Concatenate durations, batch_sizes, and first token times.
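+    // The concatenated raw vectors are re-evaluated lazily: m_evaluated is reset at the
+    // end of this function, so the next getter call recomputes the statistics.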
+ auto& new_durations = res.raw_metrics.m_durations; + auto& new_batch_sizes = res.raw_metrics.m_batch_sizes; + auto& new_times_to_first_token = res.raw_metrics.m_times_to_first_token; + auto& right_durations = right.raw_metrics.m_durations; + auto& right_batch_sizes = right.raw_metrics.m_batch_sizes; + auto& right_times_to_first_token = right.raw_metrics.m_times_to_first_token; + + new_durations.insert(new_durations.end(), right_durations.begin(), right_durations.end()); + new_times_to_first_token.insert(new_times_to_first_token.end(), right_times_to_first_token.begin(), right_times_to_first_token.end()); + new_batch_sizes.insert(new_batch_sizes.end(), right_batch_sizes.begin(), right_batch_sizes.end()); + + // Concatenate tokenization/detokenization and total generation times. + auto& new_tok_durations = res.raw_metrics.tokenization_durations; + auto& new_detok_durations = res.raw_metrics.detokenization_durations; + auto& new_gen_durations = res.raw_metrics.generate_durations; + auto& right_tok_durations = right.raw_metrics.tokenization_durations; + auto& right_detok_durations = right.raw_metrics.detokenization_durations; + auto& right_gen_durations = right.raw_metrics.generate_durations; + + new_tok_durations.insert(new_tok_durations.end(), right_tok_durations.begin(), right_tok_durations.end()); + new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end()); + new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end()); + + res.num_generated_tokens = num_generated_tokens + right.num_generated_tokens; + res.num_input_tokens = num_generated_tokens + right.num_input_tokens; + res.load_time = load_time; + res.m_evaluated = false; + return res; +} + +PerfMetrics& PerfMetrics::operator+=(const PerfMetrics& right) { + *this = *this + right; + return *this; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index c52ed8d7a6..cbd6668f90 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -374,7 +374,7 @@ class Scheduler { m_block_manager.allocate(sequence, num_required_blocks, sequence_group->get_prompt_ids()); else m_block_manager.append_slots(sequence_group); - + // add information to scheduler_output { scheduler_output.m_scheduled_sequence_groups_ids.push_back(sequence_group_id); diff --git a/src/cpp/src/synchronized_queue.hpp b/src/cpp/src/synchronized_queue.hpp index 0c2cd3180d..bd025f1b7d 100644 --- a/src/cpp/src/synchronized_queue.hpp +++ b/src/cpp/src/synchronized_queue.hpp @@ -17,6 +17,12 @@ class SynchronizedQueue SynchronizedQueue(const SynchronizedQueue&&) = delete; SynchronizedQueue& operator=(const SynchronizedQueue&) = delete; + T back() { + std::unique_lock lock(m_mutex); + m_cv.wait(lock, [this]{return !m_queue.empty();}); + return m_queue.back(); + } + T pull() { std::unique_lock lock(m_mutex); m_cv.wait(lock, [this]{return !m_queue.empty();}); diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 748daa5875..44b6b30d49 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -6,6 +6,7 @@ #include "utils.hpp" #include #include +#include #include "tokenizers_path.hpp" #include "circular_buffer_queue.hpp" #include @@ -373,40 +374,31 @@ class Tokenizer::TokenizerImpl { " Please add 'chat_template' to tokenizer_config.json to use the model in chat scenario." 
" For more information see the section Troubleshooting in README.md"); - // Jinja2Cpp does not support slicing, e.g. [1:]. - // In templates slicing is used typically in the header to find system prompt. - // If header containts that typical expression we update template and - // extract system message manually from ChatHistory. - std::string header_with_slice = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}"; - std::string replacement_string = "{% if false %}{% set placeholder = false %}"; - - std::string system_message = ""; - size_t pos = chat_tpl.find(header_with_slice); - if (pos != std::string::npos) { - chat_tpl.replace(pos, header_with_slice.length(), replacement_string); - - if (!history.empty() && history[0].at("role") == "system") { - system_message = history[0].at("content"); - history.erase(history.begin()); - } + // Jinja2Cpp does not support Python-style slicing, e.g. [1:]. + // If chat template contains such slicing, we replace it with custom function `slice()` (user-defined callable) + // that is defined below and does the same list slicing logic. + std::string slice_string = "messages[1:]"; + std::string replacement_slice_string = "slice(messages, 1)"; + size_t slice_pos = chat_tpl.find(slice_string); + if (slice_pos != std::string::npos) { + chat_tpl.replace(slice_pos, slice_string.length(), replacement_slice_string); } - - // Jinja2Cpp accepts system_message only as a string and incorrectly handles it as a bool. - // Both this patters are found frequently in chat templates, replace so that jinja2cpp - // will not stumble on them. - std::pair replace_str_map[] = { - {"{% set system_message = false %}", ""}, - {"system_message != false", "true"}, - }; - if (!system_message.empty()) { - for (const auto& [from, to] : replace_str_map) { - size_t pos = 0; - while ((pos = chat_tpl.find(from, pos)) != std::string::npos) { - chat_tpl.replace(pos, from.size(), to); - pos += to.size(); + jinja2::UserCallable slice_callable = jinja2::MakeCallable( + [](const jinja2::ValuesList& list, const int64_t start) { + if (list.empty()) + return jinja2::Value(); + jinja2::ValuesList result; + int64_t stop = list.size(); + int64_t step = 1; + for (int64_t i = start; i < stop && i < list.size(); i += step) + { + result.push_back(list.at(i)); } - } - } + + return jinja2::Value(result); + }, + jinja2::ArgInfo{"list"}, jinja2::ArgInfo{"start"} + ); jinja2::TemplateEnv env; env.GetSettings().lstripBlocks = true; @@ -426,13 +418,13 @@ class Tokenizer::TokenizerImpl { {"bos_token", m_bos_token}, {"eos_token", m_eos_token}, {"pad_token", m_pad_token}, - {"system_message", system_message.empty() ? jinja2::EmptyValue() : jinja2::Value{system_message}}, {"add_generation_prompt", add_generation_prompt}, + {"slice", slice_callable}, }; - + try { return tpl.RenderAsString(params).value(); - } catch (const std::bad_alloc& error) { + } catch (const std::exception& error) { OPENVINO_THROW("Chat template for the current model is not supported by Jinja2Cpp. " "Please apply template manually to your prompt before calling generate. " "For exmaple: user{user_prompt}model"); diff --git a/src/docs/BUILD.md b/src/docs/BUILD.md index 3b89995dc2..548309b7d7 100644 --- a/src/docs/BUILD.md +++ b/src/docs/BUILD.md @@ -3,73 +3,149 @@ > **NOTE**: There is a known Python API issue with `ov::Tensor`. The issue is reproduced when building OpenVINO GenAI from sources while using OpenVINO from archives. Using `ov::Tensor` with OpenVINO GenAI fails. 
Possible errors: `TypeError: generate(): incompatible function arguments.`, `TypeError: __init__(): incompatible constructor arguments.`, `TypeError: Unregistered type : ov::Tensor`. The preferred approach is to build both OpenVINO and OpenVINO GenAI from sources using the same build environment. Or to install prebuilt OpenVINO GenAI from [distribution channels](https://docs.openvino.ai/2024/get-started/install-openvino.html). -## Build for Linux Systems +## Software Requirements -### Software Requirements +### Linux - [CMake](https://cmake.org/download/) 3.23 or higher - GCC 7.5 or higher - Python 3.8 or higher +- Git -### Build Instructions +### Windows + +- [CMake](https://cmake.org/download/) 3.23 or higher +- Microsoft Visual Studio 2019 or higher, version 16.3 or later +- Python 3.8 or higher +- Git for Windows +- [NSIS](https://sourceforge.net/projects/nsis/) + +### macOS + +- [CMake](https://cmake.org/download/) 3.23 or higher +- [brew](https://brew.sh/) package manager to install additional dependencies: + ```sh + brew install coreutils scons + ``` +- Clang compiler and other command line tools from Xcode 10.1 or higher: + ```sh + xcode-select --install + ``` +- Python 3.8 or higher +- Git + + +## Build Instructions + +### Build OpenVINO, OpenVINO Tokenizers, and OpenVINO GenAI From Source 1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). -The path to the openvino install directory is referred as throughout the document. +The path to the OpenVINO install directory is referred as `` throughout the document. 2. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -3. Build the project: +3. Set up the environment: + + #### Option 1 - using OpenVINO `setupvars` script: + + Linux and macOS: ```sh source /setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov ``` -## Build for Windows Systems + Windows Command Prompt: + ```cmd + call \setupvars.bat + ``` -### Software Requirements + Windows PowerShell: + ```cmd + . /setupvars.ps1 + ``` -- [CMake](https://cmake.org/download/) 3.23 or higher -- Microsoft Visual Studio 2019 or higher, version 16.3 or later -- Python 3.8 or higher -- Git for Windows + #### Option 2 - setting environment variables manually: -### Build Instructions + Linux: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` -1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build) -The path to the openvino install directory is referred as throughout the document. -2. Clone OpenVINO GenAI repository and init submodules: + macOS: ```sh - git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git - cd openvino.genai + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export DYLD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` + + Windows Command Prompt: + ```cmd + set OpenVINO_DIR=\runtime + set PYTHONPATH=\python;%CD%\build;%PYTHONPATH% + set OPENVINO_LIB_PATHS=\bin\intel64\Release;%OPENVINO_LIB_PATHS% + set PATH=%OPENVINO_LIB_PATHS%;%PATH% ``` -3. 
Build the project: + + Windows PowerShell: + ```sh + $env:OpenVINO_DIR = "\runtime" + $env:PYTHONPATH = "\python;$PWD\build;$env:PYTHONPATH" + $env:OPENVINO_LIB_PATHS = "\bin\intel64\Release;$env:OPENVINO_LIB_PATHS" + $env:PATH = "$env:OPENVINO_LIB_PATHS;$env:PATH" + ``` + +4. Build the project: ```sh - call \setupvars.bat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov + cmake --build ./build/ --config Release -j ``` -## Build for macOS Systems +5. Install OpenVINO GenAI: -### Software Requirements + #### Option 1 - using cmake: + + The following command will store built OpenVINO GenAI artifacts along with OpenVINO in ``: -- [CMake](https://cmake.org/download/) 3.23 or higher -- [brew](https://brew.sh/) package manager to install additional dependencies: ```sh - brew install coreutils scons + cmake --install ./build/ --config Release --prefix ``` -- Clang compiler and other command line tools from Xcode 10.1 or higher: + + #### Option 2 - setting paths to built OpenVINO GenAI artifacts manually: + + The path to the OpenVINO GenAI root directory is referred as `` throughout the document. + + Linux: ```sh - xcode-select --install + export PYTHONPATH=/build/:$PYTHONPATH + export LD_LIBRARY_PATH=/build/openvino_genai/:$LD_LIBRARY_PATH + ``` + + macOS: + ```sh + export PYTHONPATH=/build:$PYTHONPATH + export DYLD_LIBRARY_PATH=/build/openvino_genai:$DYLD_LIBRARY_PATH ``` -- Python 3.8 or higher -### Build Instructions + Windows Command Prompt: + ```cmd + set PYTHONPATH=\build;%PYTHONPATH% + set PATH=\build\openvino_genai;%PATH% + ``` + + Windows PowerShell: + ```sh + $env:PYTHONPATH = "\build;$env:PYTHONPATH" + $env:PATH = "\build\openvino_genai;$env:PATH" + ``` + +To optimize the package size, you can reduce the ICU (International Components for Unicode) data size when OpenVINO Tokenizers are built as a submodule of OpenVINO GenAI. +For more information please refer to the [OpenVINO Tokenizers instructions](https://github.com/openvinotoolkit/openvino_tokenizers?tab=readme-ov-file#reducing-the-icu-data-size). + + +### Build OpenVINO GenAI Wheel 1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build) The path to the openvino install directory is referred as throughout the document. @@ -78,10 +154,54 @@ The path to the openvino install directory is referred as througho git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -3. Build the project: +2. Set up the environment: + - Option 1 - using OpenVINO `setupvars.sh` script: + ```sh + source /setupvars.sh + ``` + - Option 2 - setting environment variables manually: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` +3. Upgrade pip to ensure you have the latest version: ```sh - source /setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov + python -m pip install --upgrade pip + ``` +4. Build the wheel in the `dist` directory: + ```sh + python -m pip wheel . -w dist/ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + ``` + +### Install OpenVINO GenAI From Source + +1. 
Clone OpenVINO GenAI repository and init submodules: + ```sh + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai + ``` +2. Set up the environment: + - Option 1 - using OpenVINO `setupvars.sh` script: + ```sh + source /setupvars.sh + ``` + - Option 2 - setting environment variables manually: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` +3. Upgrade pip to ensure you have the latest version: + ```sh + python -m pip install --upgrade pip + ``` +4. Install the package directly from source: + ```sh + python -m pip install . + ``` +5. To verify the installation, run a simple Python script: + ```python + import openvino_genai + print(openvino_genai.__version__) ``` diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index f2dea4b830..a429fc4801 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -20,7 +20,10 @@ using ov::genai::EncodedResults; using ov::genai::GenerationConfig; using ov::genai::GenerationResult; using ov::genai::LLMPipeline; +using ov::genai::MeanStdPair; using ov::genai::OptionalGenerationConfig; +using ov::genai::PerfMetrics; +using ov::genai::RawPerfMetrics; using ov::genai::SchedulerConfig; using ov::genai::StopCriteria; using ov::genai::StreamerBase; @@ -36,6 +39,17 @@ using PyBindStreamerVariant = std::variant, std::sh template struct overloaded : Ts... { using Ts::operator()...; }; template overloaded(Ts...) -> overloaded; +template +std::vector get_ms(const T& instance, U T::*member) { + // Converts c++ duration to float so that it can be used in Python. + std::vector res; + const auto& durations = instance.*member; + res.reserve(durations.size()); + std::transform(durations.begin(), durations.end(), std::back_inserter(res), + [](const auto& duration) { return duration.count(); }); + return res; +} + namespace { auto generate_docstring = R"( @@ -88,6 +102,86 @@ auto generation_config_docstring = R"( repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. )"; +auto raw_perf_metrics_docstring = R"( + Structure with raw performance metrics for each generation before any statistics are calculated. + + :param generate_durations: Durations for each generate call in microseconds. + :type generate_durations: List[MicroSeconds] + + :param tokenization_durations: Durations for the tokenization process in microseconds. + :type tokenization_durations: List[MicroSeconds] + + :param detokenization_durations: Durations for the detokenization process in microseconds. + :type detokenization_durations: List[MicroSeconds] + + :param m_times_to_first_token: Times to the first token for each call in microseconds. + :type m_times_to_first_token: List[MicroSeconds] + + :param m_new_token_times: Time points for each new token generated. + :type m_new_token_times: List[TimePoint] + + :param m_batch_sizes: Batch sizes for each generate call. + :type m_batch_sizes: List[int] + + :param m_durations: Total durations for each generate call in microseconds. + :type m_durations: List[MicroSeconds] + + :param num_generated_tokens: Total number of tokens generated. + :type num_generated_tokens: int + + :param num_input_tokens: Total number of tokens in the input prompt. + :type num_input_tokens: int +)"; + +auto perf_metrics_docstring = R"( + Holds performance metrics for each generate call. 
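+
+    Metrics of several generate() calls can be combined with the + and += operators;
+    statistics are then recomputed over the merged raw metrics.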
+ + PerfMetrics holds fields with mean and standard deviations for the following metrics: + - Time To the First Token (TTFT), ms + - Time per Output Token (TPOT), ms/token + - Generate total duration, ms + - Tokenization duration, ms + - Detokenization duration, ms + - Throughput, tokens/s + + Additional fields include: + - Load time, ms + - Number of generated tokens + - Number of tokens in the input prompt + + Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs. + If mean and std were already calculated, getters return cached values. + + :param get_load_time: Returns the load time in milliseconds. + :type get_load_time: float + + :param get_num_generated_tokens: Returns the number of generated tokens. + :type get_num_generated_tokens: int + + :param get_num_input_tokens: Returns the number of tokens in the input prompt. + :type get_num_input_tokens: int + + :param get_ttft: Returns the mean and standard deviation of TTFT. + :type get_ttft: MeanStdPair + + :param get_tpot: Returns the mean and standard deviation of TPOT. + :type get_tpot: MeanStdPair + + :param get_throughput: Returns the mean and standard deviation of throughput. + :type get_throughput: MeanStdPair + + :param get_generate_duration: Returns the mean and standard deviation of generate duration. + :type get_generate_duration: MeanStdPair + + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization duration. + :type get_tokenization_duration: MeanStdPair + + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization duration. + :type get_detokenization_duration: MeanStdPair + + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. 
+ :type raw_metrics: RawPerfMetrics +)"; OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfig& config, const py::kwargs& kwargs) { if(!config.has_value() && kwargs.empty()) @@ -151,6 +245,33 @@ OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfi return res_config; } +ov::Any py_object_to_any(const py::object& py_obj); + +bool py_object_is_any_map(const py::object& py_obj) { + if (!py::isinstance(py_obj)) { + return false; + } + auto dict = py::cast(py_obj); + return std::all_of(dict.begin(), dict.end(), [&](const std::pair& elem) { + return py::isinstance(elem.first); + }); +} + +ov::AnyMap py_object_to_any_map(const py::object& py_obj) { + OPENVINO_ASSERT(py_object_is_any_map(py_obj), "Unsupported attribute type."); + ov::AnyMap return_value = {}; + for (auto& item : py::cast(py_obj)) { + std::string key = py::cast(item.first); + py::object value = py::cast(item.second); + if (py_object_is_any_map(value)) { + return_value[key] = py_object_to_any_map(value); + } else { + return_value[key] = py_object_to_any(value); + } + } + return return_value; +} + ov::Any py_object_to_any(const py::object& py_obj) { // Python types py::object float_32_type = py::module_::import("numpy").attr("float32"); @@ -213,6 +334,8 @@ ov::Any py_object_to_any(const py::object& py_obj) { } // OV types + } else if (py_object_is_any_map(py_obj)) { + return py_object_to_any_map(py_obj); } else if (py::isinstance(py_obj)) { return py::cast(py_obj); } else if (py::isinstance(py_obj)) { @@ -534,7 +657,45 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init<>()) .def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); }) .def_readonly("scores", &DecodedResults::scores) - .def("__str__", &DecodedResults::operator std::string);; + .def_readonly("perf_metrics", &DecodedResults::perf_metrics) + .def("__str__", &DecodedResults::operator std::string); + + py::class_(m, "RawPerfMetrics", raw_perf_metrics_docstring) + .def(py::init<>()) + .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) + .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::tokenization_durations); + }) + .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::detokenization_durations); + }) + .def_property_readonly("m_times_to_first_token", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_times_to_first_token); + }) + .def_property_readonly("m_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_durations); + }) + .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes) + .def_readonly("num_generated_tokens", &RawPerfMetrics::num_generated_tokens) + .def_readonly("num_input_tokens", &RawPerfMetrics::num_input_tokens); + + py::class_(m, "MeanStdPair") + .def(py::init<>()) + .def_readonly("mean", &MeanStdPair::mean) + .def_readonly("std", &MeanStdPair::std); + + py::class_(m, "PerfMetrics", perf_metrics_docstring) + .def(py::init<>()) + .def("get_generate_duration", &PerfMetrics::get_generate_duration) + .def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration) + .def("get_detokenization_duration", &PerfMetrics::get_detokenization_duration) + .def("get_throughput", &PerfMetrics::get_throughput) + .def("get_tpot", &PerfMetrics::get_tpot) + .def("get_ttft", &PerfMetrics::get_ttft) + .def("get_load_time", &PerfMetrics::get_load_time) + 
.def("__add__", &PerfMetrics::operator+) + .def("__iadd__", &PerfMetrics::operator+=) + .def_readonly("raw_metrics", &PerfMetrics::raw_metrics); py::class_(m, "TokenizedInputs") .def(py::init()) @@ -543,7 +704,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "EncodedResults") .def_readonly("tokens", &EncodedResults::tokens) - .def_readonly("scores", &EncodedResults::scores); + .def_readonly("scores", &EncodedResults::scores) + .def_readonly("perf_metrics", &EncodedResults::perf_metrics); py::class_>(m, "StreamerBase") // Change the holder form unique_ptr to shared_ptr .def(py::init<>()) @@ -594,7 +756,6 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("dynamic_split_fuse", &SchedulerConfig::dynamic_split_fuse) .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs) .def_readwrite("enable_prefix_caching", &SchedulerConfig::enable_prefix_caching); - py::class_(m, "ContinuousBatchingPipeline") .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) { @@ -607,8 +768,22 @@ PYBIND11_MODULE(py_generate_pipeline, m) { }), py::arg("model_path"), py::arg("tokenizer"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({})) .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) .def("get_config", &ContinuousBatchingPipeline::get_config) - .def("add_request", &ContinuousBatchingPipeline::add_request) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) - .def("generate", &ContinuousBatchingPipeline::generate); + .def( + "generate", + py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), + py::arg("input_ids"), + py::arg("sampling_params"), + py::arg("streamer") = std::monostate{} + ) + .def( + "generate", + py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), + py::arg("prompts"), + py::arg("sampling_params"), + py::arg("streamer") = std::monostate{} + ); } diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index 5468fd014b..0a4b04f880 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -367,8 +367,6 @@ TEST(TestScheduler, test_partially_preempted_prompt) { } } - - TEST(TestScheduler, prefix_caching_test) { std::array configs = {SchedulerConfig(), SchedulerConfig()}; configs.at(0).max_num_batched_tokens = 32; diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index bf76df534d..98b791443b 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -103,37 +103,35 @@ def get_chat_templates(): # TODO: Need to support chat templates in more models: CVS-145963 # Either ov_genai is unable to parse chat_template or results do not match with HF. 
"meta-llama/Meta-Llama-3-8B-Instruct", - "databricks/dbrx-instruct", + "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp "mosaicml/mpt-30b-chat", - "deepseek-ai/deepseek-coder-6.7b-instruct", - "maldv/winter-garden-7b-alpha", - "ishorn5/RTLCoder-Deepseek-v1.1", + "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp + "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp + "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp + "openchat/openchat-3.5-0106", "casperhansen/llama-3-70b-instruct-awq", "TheBloke/deepseek-coder-33B-instruct-GPTQ", "AI-Sweden-Models/gpt-sw3-356m-instruct", "google/gemma-7b-it", "THUDM/cogvlm2-llama3-chat-19B", "KnutJaegersberg/internlm-20b-llama", - "alpindale/WizardLM-2-8x22B", "maywell/Synatra-Mixtral-8x7B", "MediaTek-Research/Breeze-7B-Instruct-v1_0", "bofenghuang/vigostral-7b-chat", - "meetkai/functionary-small-v2.5", - "nvidia/Llama3-ChatQA-1.5-8B", + "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp "openchat/openchat-3.6-8b-20240522", "tenyx/TenyxChat-7B-v1", "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2", "yam-peleg/Hebrew-Gemma-11B-V2", - "shenzhi-wang/Llama3-8B-Chinese-Chat", + "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError "nlpai-lab/KULLM3", "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1", - "MediaTek-Research/Breeze-7B-Instruct-v0_1", - "shanchen/llama3-8B-slerp-biomed-chat-chinese", + "MediaTek-Research/Breeze-7B-Instruct-v0_1", + "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError "MLP-KTLim/llama-3-Korean-Bllossom-8B", - "lucyknada/microsoft_WizardLM-2-7B", - "aloobun/CosmicBun-8B", + "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp "codellama/CodeLlama-70b-Instruct-hf", - "gorilla-llm/gorilla-openfunctions-v2", + "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp "BramVanroy/Llama-2-13b-chat-dutch" } from tokenizer_configs import get_tokenizer_configs @@ -221,3 +219,8 @@ def load_pipe(configs: List[Tuple], temp_path): with (temp_path / config_name).open('w') as f: json.dump(config_json, f) return ov_genai.LLMPipeline(str(temp_path)) + + +@functools.lru_cache(1) +def get_continuous_batching(path): + return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index 5a73d481d3..295674e101 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -1,6 +1,7 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import math import openvino import openvino_tokenizers import openvino_genai as ov_genai @@ -12,7 +13,8 @@ read_model, load_tok, model_tmp_path, - get_chat_templates + get_chat_templates, + get_continuous_batching, ) @@ -167,3 +169,20 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): print(f'hf reference: {full_history_str_hf}') print(f'ov_genai out: {full_history_str}') assert full_history_str == full_history_str_hf + + +@pytest.mark.parametrize("generation_config", configs[1:]) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +@pytest.mark.skip("continuous_batching seg faults with nightly ov. 
Ticket 147793") +def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): + model_id, path, tokenizer, model, stateful = read_model((model_descr[0], model_descr[1] / '_test_chat')) + cb = get_continuous_batching(path) + stateful.start_chat() + cb.start_chat() + for question in quenstions: + generated = cb.generate(question, **generation_config) + reference = stateful.generate(question, **generation_config) + assert generated == reference + # Test that finish_chat() doesn't fail just in case. + cb.finish_chat() diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index b4e275eef2..fe306e2a37 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -11,6 +11,7 @@ import sys from pathlib import Path import torch +import math from ov_genai_test_utils import ( get_models_list, read_model, @@ -18,11 +19,11 @@ load_tok, model_tmp_path, STOP_CRITERIA_MAP, + get_continuous_batching, ) def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): - device = 'CPU' model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects num_beams = config['num_beams'] if 'num_beams' in config else 1 @@ -67,7 +68,6 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro assert hf_output == ov_output def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str): - device = 'CPU' model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects @@ -75,7 +75,7 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str if 'do_sample' not in config: # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set exlicitly to False, but only if test arguments omitted this arg. + # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False config['repetition_penalty'] = None @@ -705,3 +705,41 @@ def test_left_pad(): models[2].pad_token = models[2].eos_token run_hf_ov_genai_comparison_batched(models, config, prompts) + + +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompt", batched_prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.skip("continuous_batching seg faults with nightly ov. Ticket 147793") +def test_continuous_batching_vs_stateful(model_descr, prompt, generation_config): + model_id, path, tokenizer, model, stateful = read_model(( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + Path("TinyLlama-1.1B-Chat-v1.0") + )) + config = ov_genai.GenerationConfig() + config.max_new_tokens = 100 + cb = get_continuous_batching(path) + generated = cb.generate(prompt, **generation_config) + reference = stateful.generate(prompt, **generation_config) + assert generated.texts == reference.texts + if 1 != generation_config.get("num_return_sequences", 1): + # Stateful puts zeroes to generated.scores. Don't compare them. 
+ for gen, ref in zip(generated.scores, reference.scores): + assert math.isclose(gen, ref, abs_tol=0.0003) + +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.skip("continuous_batching seg faults with nightly ov. Ticket 147793") +def test_cb_streamer_vs_return_vs_stateful(model_descr, prompt): + model_id, path, tokenizer, model, stateful = read_model(( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + Path("TinyLlama-1.1B-Chat-v1.0") + )) + cb = get_continuous_batching(path) + streamed = [] + generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) + reference = stateful.generate(prompt, max_new_tokens=20) + assert generated == "".join(streamed) + assert "".join(streamed) == reference diff --git a/tests/python_tests/tokenizer_configs.py b/tests/python_tests/tokenizer_configs.py index 4caf031463..d8a21946cc 100644 --- a/tests/python_tests/tokenizer_configs.py +++ b/tests/python_tests/tokenizer_configs.py @@ -995,5 +995,11 @@ def get_tokenizer_configs(): "unk_token": "", "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" }, + "mistralai/Mistral-7B-Instruct-v0.1": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n" + } } -
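
The Python bindings above expose the same metrics as the C++ API. Below is a minimal sketch of reading them from Python; `model_dir` is a placeholder for a model directory exported for OpenVINO GenAI, the prompt is arbitrary, and the reported numbers depend on the model and hardware.

```python
# Minimal sketch: read performance metrics from a generate() call.
# "model_dir" is a placeholder for an exported OpenVINO GenAI model directory.
import openvino_genai

pipe = openvino_genai.LLMPipeline("model_dir", "CPU")

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 20

# Passing a list of prompts keeps the result as DecodedResults, which carries perf_metrics.
res = pipe.generate(["The Sun is yellow because"], config)
metrics = res.perf_metrics

print(f"Load time: {metrics.get_load_time():.2f} ms")
print(f"TTFT: {metrics.get_ttft().mean:.2f} +/- {metrics.get_ttft().std:.2f} ms")
print(f"TPOT: {metrics.get_tpot().mean:.2f} +/- {metrics.get_tpot().std:.2f} ms/token")
print(f"Throughput: {metrics.get_throughput().mean:.2f} +/- {metrics.get_throughput().std:.2f} tokens/s")

# Metrics from repeated runs on the same pipeline can be accumulated;
# statistics are recomputed lazily on the next getter call.
metrics += pipe.generate(["The Sun is yellow because"], config).perf_metrics
print(f"Generate duration: {metrics.get_generate_duration().mean:.2f} ms")
```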