diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 527259f203..79af10734d 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -13,9 +13,9 @@ concurrency: cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240719_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip jobs: cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index 3df73d437b..b59f5f1eb3 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -5,9 +5,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240719_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip jobs: ubuntu_genai_package: strategy: diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index 257a9c2f57..cf5ce91f01 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -5,9 +5,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} cancel-in-progress: true env: - l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_centos7_2024.3.0.dev20240711_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip + l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_centos7_2024.3.0.dev20240719_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip jobs: ubuntu_genai_python_lib: # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env. diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index ed80a66deb..ab0b16db41 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -1,17 +1,18 @@ --extra-index-url https://download.pytorch.org/whl/cpu numpy --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly -openvino -openvino-tokenizers -openvino_genai +openvino~=2024.3.0 +openvino-tokenizers~=2024.3.0 +openvino_genai~=2024.3.0 auto-gptq>=0.5.1 # for gptq pillow -torch -transformers>=4.40.0 +torch<2.5.0 +torchvision<0.20.0 +transformers>=4.40.0,<4.43.0 diffusers>=0.22.0 -#optimum is in dependency list of optimum-intel -git+https://github.com/huggingface/optimum-intel.git@439d61f79cf55d5d0b28334f577b6ac3c5ced28f#egg=optimum-intel -git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf +#optimum is in dependency list of optimum-intel +git+https://github.com/huggingface/optimum-intel.git@6388aeb8738b63e28fc594af84df94590e77cb9a#egg=optimum-intel +nncf~=2.12.0 packaging psutil timm diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md index 0778868e6a..351773ec0d 100644 --- a/samples/python/multinomial_causal_lm/README.md +++ b/samples/python/multinomial_causal_lm/README.md @@ -2,6 +2,8 @@ This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. +This sample also contains example implementation of an iterable streamer with bufferisation. + ## Download and convert the model and tokenizers The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -22,6 +24,12 @@ Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. +## Streaming + +This Python example demonstrates custom detokenization with bufferization. The streamer receives integer tokens corresponding to each word or subword, one by one. If tokens are decoded individually, the resulting text misses necessary spaces because of detokenize(tokenize(" a")) == "a". + +To address this, the detokenizer needs a larger context. We accumulate tokens in a tokens_cache buffer and decode multiple tokens together, adding the text to the streaming queue only when a complete decoded chunk is ready. We run a separate thread to print all new elements arriving in this queue from the generation pipeline. Each generated chunk of text is put into a synchronized queue, ensuring that all put and get operations are thread-safe and blocked until they can proceed. + ### Troubleshooting #### Unicode characters encoding error on Windows diff --git a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py index 6feb8a2b85..6300320264 100755 --- a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py +++ b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py @@ -4,11 +4,120 @@ import argparse import openvino_genai +import queue +import threading -def streamer(subword): - print(subword, end='', flush=True) - return False +class IterableStreamer(openvino_genai.StreamerBase): + """ + A custom streamer class for handling token streaming and detokenization with buffering. + + Attributes: + tokenizer (Tokenizer): The tokenizer used for encoding and decoding tokens. + tokens_cache (list): A buffer to accumulate tokens for detokenization. + text_queue (Queue): A synchronized queue for storing decoded text chunks. + print_len (int): The length of the printed text to manage incremental decoding. + """ + + def __init__(self, tokenizer): + """ + Initializes the IterableStreamer with the given tokenizer. + + Args: + tokenizer (Tokenizer): The tokenizer to use for encoding and decoding tokens. + """ + super().__init__() + self.tokenizer = tokenizer + self.tokens_cache = [] + self.text_queue = queue.Queue() + self.print_len = 0 + + def __iter__(self): + """ + Returns the iterator object itself. + """ + return self + + def __next__(self): + """ + Returns the next value from the text queue. + + Returns: + str: The next decoded text chunk. + + Raises: + StopIteration: If there are no more elements in the queue. + """ + value = self.text_queue.get() # get() will be blocked until a token is available. + if value is None: + raise StopIteration + return value + + def get_stop_flag(self): + """ + Checks whether the generation process should be stopped. + + Returns: + bool: Always returns False in this implementation. + """ + return False + + def put_word(self, word: str): + """ + Puts a word into the text queue. + + Args: + word (str): The word to put into the queue. + """ + self.text_queue.put(word) + + def put(self, token_id: int) -> bool: + """ + Processes a token and manages the decoding buffer. Adds decoded text to the queue. + + Args: + token_id (int): The token_id to process. + + Returns: + bool: True if generation should be stopped, False otherwise. + """ + self.tokens_cache.append(token_id) + text = self.tokenizer.decode(self.tokens_cache) + + word = '' + if len(text) > self.print_len and '\n' == text[-1]: + # Flush the cache after the new line symbol. + word = text[self.print_len:] + self.tokens_cache = [] + self.print_len = 0 + elif len(text) >= 3 and text[-3:] == chr(65533): + # Don't print incomplete text. + pass + elif len(text) > self.print_len: + # It is possible to have a shorter text after adding new token. + # Print to output only if text lengh is increaesed. + word = text[self.print_len:] + self.print_len = len(text) + self.put_word(word) + + if self.get_stop_flag(): + # When generation is stopped from streamer then end is not called, need to call it here manually. + self.end() + return True # True means stop generation + else: + return False # False means continue generation + + def end(self): + """ + Flushes residual tokens from the buffer and puts a None value in the queue to signal the end. + """ + text = self.tokenizer.decode(self.tokens_cache) + if len(text) > self.print_len: + word = text[self.print_len:] + self.put_word(word) + self.tokens_cache = [] + self.print_len = 0 + self.put_word(None) def main(): @@ -19,17 +128,25 @@ def main(): device = 'CPU' # GPU can be used as well pipe = openvino_genai.LLMPipeline(args.model_dir, device) - + + text_print_streamer = IterableStreamer(pipe.get_tokenizer()) + def token_printer(): + # Getting next elements from iterable will be blocked until a new token is available. + for word in text_print_streamer: + print(word, end='', flush=True) + printer_thread = threading.Thread(target=token_printer, daemon=True) + printer_thread.start() + config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 config.do_sample = True config.top_p = 0.9 config.top_k = 30 - # Since the streamer is set, the results will - # be printed each time a new token is generated. - pipe.generate(args.prompt, config, streamer) - + # Since the streamer is set, the results will be printed + # every time a new token is generated and put into the streamer queue. + pipe.generate(args.prompt, config, text_print_streamer) + printer_thread.join() if '__main__' == __name__: main() diff --git a/src/README.md b/src/README.md index e88c2f784f..33421619d5 100644 --- a/src/README.md +++ b/src/README.md @@ -5,10 +5,24 @@ It hides the complexity of the generation process and minimizes the amount of co ## Install OpenVINO™ GenAI +> **NOTE**: Please make sure that you are following the versions compatibility rules, refer to the [OpenVINO™ GenAI Dependencies](#openvino-genai-dependencies) for more information. + The OpenVINO™ GenAI flavor is available for installation via Archive and PyPI distributions. To install OpenVINO™ GenAI, refer to the [Install Guide](https://docs.openvino.ai/2024/get-started/install-openvino.html). -To build OpenVINO™ GenAI library from source, refer to the [Build Instructions](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/BUILD.md). +To build OpenVINO™ GenAI library from source, refer to the [Build Instructions](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/src/docs/BUILD.md). + +### OpenVINO™ GenAI Dependencies + +OpenVINO™ GenAI depends on [OpenVINO](https://github.com/openvinotoolkit/openvino) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). + +When installing OpenVINO™ GenAI from PyPi, the same versions of OpenVINO and OpenVINO Tokenizers are used (e.g. `openvino==2024.3.0` and `openvino-tokenizers==2024.3.0.0` are installed for `openvino-genai==2024.3.0`). +If you update one of the dependency packages (e.g. install `openvino-nightly`), versions might be incompatible due to different ABI and running OpenVINO GenAI can result in errors (e.g. `ImportError: libopenvino.so.2440: cannot open shared object file: No such file or directory`). +Having packages version in format `...`, only `` part of the full version can be varied to ensure ABI compatibility, while changing ``, `` or `` parts of the version might break ABI. + +GenAI, Tokenizers, and OpenVINO wheels for Linux on PyPI are compiled with `_GLIBCXX_USE_CXX11_ABI=0` to cover a wider range of platforms. In contrast, C++ archive distributions for Ubuntu are compiled with `_GLIBCXX_USE_CXX11_ABI=1`. It is not possible to mix different Application Binary Interfaces (ABIs) because doing so results in a link error. This incompatibility prevents the use of, for example, OpenVINO from C++ archive distributions alongside GenAI from PyPI. + +If you want to try OpenVINO GenAI with different dependencies versions (**not** prebuilt packages as archives or python wheels), build OpenVINO GenAI library from source. ## Usage @@ -16,16 +30,16 @@ To build OpenVINO™ GenAI library from source, refer to the [Build Instructions 1. Installed OpenVINO™ GenAI - > If OpenVINO GenAI is installed via archive distribution or built from source, you will need to install additional python dependencies (e.g. `optimum-cli` for simplified model downloading and exporting, it's not required to install [./samples/requirements.txt](./samples/requirements.txt) for deployment if the model has already been exported): - > - > ```sh - > # (Optional) Clone OpenVINO GenAI repository if it does not exist - > git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git - > cd openvino.genai - > # Install python dependencies - > python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - > python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt - > ``` + > To use OpenVINO GenAI with models that are already in OpenVINO format, no additional python dependencies are needed. To + > convert models with optimum-cli and to run the examples, install the dependencies in [./samples/requirements.txt](./samples/requirements.txt): + ```sh + # (Optional) Clone OpenVINO GenAI repository if it does not exist + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai + # Install python dependencies + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt + ``` 2. A model in OpenVINO IR format @@ -164,6 +178,8 @@ int main(int argc, char* argv[]) { ``` Streaming with a custom class: + +C++ template for a stremer. ```cpp #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/llm_pipeline.hpp" @@ -172,18 +188,14 @@ Streaming with a custom class: class CustomStreamer: public ov::genai::StreamerBase { public: bool put(int64_t token) { - bool stop_flag = false; - /* - custom decoding/tokens processing code - tokens_cache.push_back(token); - std::string text = m_tokenizer.decode(tokens_cache); - ... - */ - return stop_flag; // flag whether generation should be stoped, if true generation stops. + // Custom decoding/tokens processing logic. + + // Returns a flag whether generation should be stoped, if true generation stops. + return false; }; void end() { - /* custom finalization */ + // Custom finalization logic. }; }; @@ -192,10 +204,35 @@ int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::genai::LLMPipeline pipe(model_path, "CPU"); - std::cout << pipe.generate("The Sun is yellow because", ov::genai::streamer(custom_streamer), ov::genai::max_new_tokens(200)); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(15), ov::genai::streamer(custom_streamer)); } ``` +Python template for a streamer. +```py +import openvino_genai as ov_genai + +class CustomStreamer(ov_genai.StreamerBase): + def __init__(self): + super().__init__() + # Initialization logic. + + def put(self, token_id) -> bool: + # Custom decoding/tokens processing logic. + + # Returns a flag whether generation should be stoped, if true generation stops. + return False + + def end(self): + # Custom finalization logic. + +pipe = ov_genai.LLMPipeline(model_path, "CPU") +custom_streamer = CustomStreamer() + +pipe.generate("The Sun is yellow because", max_new_tokens=15, streamer=custom_streamer) +``` +For fully implemented iterable CustomStreamer please refer to [multinomial_causal_lm](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/python/multinomial_causal_lm/README.md) sample. + ### Performance Metrics `openvino_genai.PerfMetrics` (referred as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. `PerfMetrics` holds fields with mean and standard deviations for the following metrics: @@ -289,8 +326,8 @@ For more examples of how metrics are used, please refer to the Python [benchmark ## How It Works -For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/HOW_IT_WORKS.md). +For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/src/docs/HOW_IT_WORKS.md). ## Supported Models -For a list of supported models, refer to the [Supported Models Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/SUPPORTED_MODELS.md). +For a list of supported models, refer to the [Supported Models Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/src/docs/SUPPORTED_MODELS.md). diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index ddb9ff581f..ad53d8d941 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -16,8 +16,18 @@ using TimePoint = std::chrono::steady_clock::time_point; using MicroSeconds = std::chrono::duration>; /** -* @brief Structure with raw performance metrics for each generation before any statistics calculated. -*/ + * @brief Structure with raw performance metrics for each generation before any statistics are calculated. + * + * @param generate_durations Durations for each generate call in microseconds. + * @param tokenization_durations Durations for the tokenization process in microseconds. + * @param detokenization_durations Durations for the detokenization process in microseconds. + * @param m_times_to_first_token Times to the first token for each call in microseconds. + * @param m_new_token_times Time points for each new token generated. + * @param m_batch_sizes Batch sizes for each generate call. + * @param m_durations Total durations for each generate call in microseconds. + * @param num_generated_tokens Total number of tokens generated. + * @param num_input_tokens Total number of tokens in the input prompt. + */ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { std::vector generate_durations; std::vector tokenization_durations; @@ -41,10 +51,52 @@ struct OPENVINO_GENAI_EXPORTS MeanStdPair { }; /** -* @brief Structure to store performance metric for each generation. -* -* @param -*/ + * @brief Holds performance metrics for each generate call. + * + * PerfMetrics holds fields with mean and standard deviations for the following metrics: + * - Time To the First Token (TTFT), ms + * - Time per Output Token (TPOT), ms/token + * - Generate total duration, ms + * - Tokenization duration, ms + * - Detokenization duration, ms + * - Throughput, tokens/s + * + * Additional fields include: + * - Load time, ms + * - Number of generated tokens + * - Number of tokens in the input prompt + * + * Preverable way to access values is via get functions. Getters calculate mean and std values from raw_metrics are return pairs. + * If mean and std were already calcualted getters return cached values. + * @param get_load_time Returns the load time in milliseconds. + * @param get_num_generated_tokens Returns the number of generated tokens. + * @param get_num_input_tokens Returns the number of tokens in the input prompt. + * @param get_ttft Returns the mean and standard deviation of TTFT. + * @param get_tpot Returns the mean and standard deviation of TPOT. + * @param get_throughput Returns the mean and standard deviation of throughput. + * @param get_generate_duration Returns the mean and standard deviation of generate duration. + * @param get_tokenization_duration Returns the mean and standard deviation of tokenization duration. + * @param get_detokenization_duration Returns the mean and standard deviation of detokenization duration. + * @param get_microsec Converts a duration to microseconds. + * @param m_evaluated Flag indicating if raw metrics were evaluated. + * If false, current mean/std TTFT, TPOT, etc. are not actual and evaluate_statistics() should recalculate them. + * @param evaluate_statistics Calculates mean and standard deviation values from raw_metrics. + * Optional start_time can be provided to update durations. + * @param operator+ Adds two PerfMetrics objects. + * @param operator+= Adds and assigns the right-hand PerfMetrics to the current object. + * @param raw_metrics A structure of RawPerfMetrics type that holds raw metrics. + * @param load_time Load time in milliseconds. + * + * Cached mean and standard deviations. + * @param ttft Mean and standard deviation of Time to the First Token (TTFT) in milliseconds. + * @param tpot Mean and standard deviation of Time per Output Token (TPOT) in milliseconds per token. + * @param throughput Mean and standard deviation of tokens per second. + * @param generate_duration Mean and standard deviation of the total duration of generate calls in milliseconds. + * @param tokenization_duration Mean and standard deviation of the tokenization duration in milliseconds. + * @param detokenization_duration Mean and standard deviation of the detokenization duration in milliseconds. + * @param num_generated_tokens Number of generated tokens. + * @param num_input_tokens Number of tokens in the input prompt. + */ struct OPENVINO_GENAI_EXPORTS PerfMetrics { float load_time; // Load time in ms. MeanStdPair ttft; // Time to the first token (in ms) (TTTFT). diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index c4ff0a90ab..af3962a0a8 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -108,8 +108,7 @@ void copy_with_offset(const ov::Tensor& orig, const int32_t offset, ov::Tensor& ov::AnyMap extract_config_or_default(const ov::AnyMap& config, const std::string& config_name) { ov::AnyMap stage_cfg; if (auto it = config.find(config_name); it != config.end()) { - const auto& map = it->second.as>(); - stage_cfg = { map.begin(), map.end() }; + stage_cfg = it->second.as(); } else if (config_name == "PREFILL_CONFIG") { std::map prefill_config = { { "NPU_USE_NPUW", "YES" }, diff --git a/src/docs/BUILD.md b/src/docs/BUILD.md index 3b89995dc2..81cff10b48 100644 --- a/src/docs/BUILD.md +++ b/src/docs/BUILD.md @@ -3,85 +3,203 @@ > **NOTE**: There is a known Python API issue with `ov::Tensor`. The issue is reproduced when building OpenVINO GenAI from sources while using OpenVINO from archives. Using `ov::Tensor` with OpenVINO GenAI fails. Possible errors: `TypeError: generate(): incompatible function arguments.`, `TypeError: __init__(): incompatible constructor arguments.`, `TypeError: Unregistered type : ov::Tensor`. The preferred approach is to build both OpenVINO and OpenVINO GenAI from sources using the same build environment. Or to install prebuilt OpenVINO GenAI from [distribution channels](https://docs.openvino.ai/2024/get-started/install-openvino.html). -## Build for Linux Systems +## Software Requirements -### Software Requirements +### Linux - [CMake](https://cmake.org/download/) 3.23 or higher - GCC 7.5 or higher - Python 3.8 or higher +- Git -### Build Instructions +### Windows + +- [CMake](https://cmake.org/download/) 3.23 or higher +- Microsoft Visual Studio 2019 or higher, version 16.3 or later +- Python 3.8 or higher +- Git for Windows +- [NSIS](https://sourceforge.net/projects/nsis/) + +### macOS + +- [CMake](https://cmake.org/download/) 3.23 or higher +- [brew](https://brew.sh/) package manager to install additional dependencies: + ```sh + brew install coreutils scons + ``` +- Clang compiler and other command line tools from Xcode 10.1 or higher: + ```sh + xcode-select --install + ``` +- Python 3.8 or higher +- Git + + +## Build Instructions + +### Build OpenVINO, OpenVINO Tokenizers, and OpenVINO GenAI From Source 1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). -The path to the openvino install directory is referred as throughout the document. +The path to the OpenVINO install directory is referred as `` throughout the document. 2. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -3. Build the project: +3. Set up the environment: + + #### Option 1 - using OpenVINO `setupvars` script: + + Linux and macOS: ```sh source /setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov ``` -## Build for Windows Systems + Windows Command Prompt: + ```cmd + call \setupvars.bat + ``` -### Software Requirements + Windows PowerShell: + ```cmd + . /setupvars.ps1 + ``` -- [CMake](https://cmake.org/download/) 3.23 or higher -- Microsoft Visual Studio 2019 or higher, version 16.3 or later -- Python 3.8 or higher -- Git for Windows + #### Option 2 - setting environment variables manually: -### Build Instructions + Linux: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` -1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build) -The path to the openvino install directory is referred as throughout the document. -2. Clone OpenVINO GenAI repository and init submodules: + macOS: ```sh - git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git - cd openvino.genai + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export DYLD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` + + Windows Command Prompt: + ```cmd + set OpenVINO_DIR=\runtime + set PYTHONPATH=\python;%CD%\build;%PYTHONPATH% + set OPENVINO_LIB_PATHS=\bin\intel64\Release;%OPENVINO_LIB_PATHS% + set PATH=%OPENVINO_LIB_PATHS%;%PATH% ``` -3. Build the project: + + Windows PowerShell: + ```sh + $env:OpenVINO_DIR = "\runtime" + $env:PYTHONPATH = "\python;$PWD\build;$env:PYTHONPATH" + $env:OPENVINO_LIB_PATHS = "\bin\intel64\Release;$env:OPENVINO_LIB_PATHS" + $env:PATH = "$env:OPENVINO_LIB_PATHS;$env:PATH" + ``` + +4. Build the project: ```sh - call \setupvars.bat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov + cmake --build ./build/ --config Release -j ``` -## Build for macOS Systems +5. Install OpenVINO GenAI: -### Software Requirements + #### Option 1 - using cmake: + + The following command will store built OpenVINO GenAI artifacts along with OpenVINO in ``: -- [CMake](https://cmake.org/download/) 3.23 or higher -- [brew](https://brew.sh/) package manager to install additional dependencies: ```sh - brew install coreutils scons + cmake --install ./build/ --config Release --prefix ``` -- Clang compiler and other command line tools from Xcode 10.1 or higher: + + #### Option 2 - setting paths to built OpenVINO GenAI artifacts manually: + + The path to the OpenVINO GenAI root directory is referred as `` throughout the document. + + Linux: ```sh - xcode-select --install + export PYTHONPATH=/build/:$PYTHONPATH + export LD_LIBRARY_PATH=/build/openvino_genai/:$LD_LIBRARY_PATH ``` -- Python 3.8 or higher -### Build Instructions + macOS: + ```sh + export PYTHONPATH=/build:$PYTHONPATH + export DYLD_LIBRARY_PATH=/build/openvino_genai:$DYLD_LIBRARY_PATH + ``` -1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build) -The path to the openvino install directory is referred as throughout the document. -2. Clone OpenVINO GenAI repository and init submodules: + Windows Command Prompt: + ```cmd + set PYTHONPATH=\build;%PYTHONPATH% + set PATH=\build\openvino_genai;%PATH% + ``` + + Windows PowerShell: + ```sh + $env:PYTHONPATH = "\build;$env:PYTHONPATH" + $env:PATH = "\build\openvino_genai;$env:PATH" + ``` + +To optimize the package size, you can reduce the ICU (International Components for Unicode) data size when OpenVINO Tokenizers are built as a submodule of OpenVINO GenAI. +For more information please refer to the [OpenVINO Tokenizers instructions](https://github.com/openvinotoolkit/openvino_tokenizers?tab=readme-ov-file#reducing-the-icu-data-size). + + +### Build OpenVINO GenAI Wheel + +1. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -3. Build the project: +2. Set up the environment: + - Option 1 - using OpenVINO `setupvars.sh` script: + ```sh + source /setupvars.sh + ``` + - Option 2 - setting environment variables manually: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` +3. Upgrade pip to ensure you have the latest version: ```sh - source /setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov + python -m pip install --upgrade pip + ``` +4. Build the wheel in the `dist` directory: + ```sh + python -m pip wheel . -w dist/ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + ``` + +### Install OpenVINO GenAI From Source + +1. Clone OpenVINO GenAI repository and init submodules: + ```sh + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai + ``` +2. Set up the environment: + - Option 1 - using OpenVINO `setupvars.sh` script: + ```sh + source /setupvars.sh + ``` + - Option 2 - setting environment variables manually: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` +3. Upgrade pip to ensure you have the latest version: + ```sh + python -m pip install --upgrade pip + ``` +4. Install the package directly from source: + ```sh + python -m pip install . + ``` +5. To verify the installation, run a simple Python script: + ```python + import openvino_genai + print(openvino_genai.__version__) ``` diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 031c8fb97b..9518e1ece4 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -102,6 +102,86 @@ auto generation_config_docstring = R"( repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. )"; +auto raw_perf_metrics_docstring = R"( + Structure with raw performance metrics for each generation before any statistics are calculated. + + :param generate_durations: Durations for each generate call in microseconds. + :type generate_durations: List[MicroSeconds] + + :param tokenization_durations: Durations for the tokenization process in microseconds. + :type tokenization_durations: List[MicroSeconds] + + :param detokenization_durations: Durations for the detokenization process in microseconds. + :type detokenization_durations: List[MicroSeconds] + + :param m_times_to_first_token: Times to the first token for each call in microseconds. + :type m_times_to_first_token: List[MicroSeconds] + + :param m_new_token_times: Time points for each new token generated. + :type m_new_token_times: List[TimePoint] + + :param m_batch_sizes: Batch sizes for each generate call. + :type m_batch_sizes: List[int] + + :param m_durations: Total durations for each generate call in microseconds. + :type m_durations: List[MicroSeconds] + + :param num_generated_tokens: Total number of tokens generated. + :type num_generated_tokens: int + + :param num_input_tokens: Total number of tokens in the input prompt. + :type num_input_tokens: int +)"; + +auto perf_metrics_docstring = R"( + Holds performance metrics for each generate call. + + PerfMetrics holds fields with mean and standard deviations for the following metrics: + - Time To the First Token (TTFT), ms + - Time per Output Token (TPOT), ms/token + - Generate total duration, ms + - Tokenization duration, ms + - Detokenization duration, ms + - Throughput, tokens/s + + Additional fields include: + - Load time, ms + - Number of generated tokens + - Number of tokens in the input prompt + + Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs. + If mean and std were already calculated, getters return cached values. + + :param get_load_time: Returns the load time in milliseconds. + :type get_load_time: float + + :param get_num_generated_tokens: Returns the number of generated tokens. + :type get_num_generated_tokens: int + + :param get_num_input_tokens: Returns the number of tokens in the input prompt. + :type get_num_input_tokens: int + + :param get_ttft: Returns the mean and standard deviation of TTFT. + :type get_ttft: MeanStdPair + + :param get_tpot: Returns the mean and standard deviation of TPOT. + :type get_tpot: MeanStdPair + + :param get_throughput: Returns the mean and standard deviation of throughput. + :type get_throughput: MeanStdPair + + :param get_generate_duration: Returns the mean and standard deviation of generate duration. + :type get_generate_duration: MeanStdPair + + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization duration. + :type get_tokenization_duration: MeanStdPair + + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization duration. + :type get_detokenization_duration: MeanStdPair + + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. + :type raw_metrics: RawPerfMetrics +)"; OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfig& config, const py::kwargs& kwargs) { if(!config.has_value() && kwargs.empty()) @@ -580,7 +660,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("perf_metrics", &DecodedResults::perf_metrics) .def("__str__", &DecodedResults::operator std::string); - py::class_(m, "RawPerfMetrics") + py::class_(m, "RawPerfMetrics", raw_perf_metrics_docstring) .def(py::init<>()) .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { @@ -604,7 +684,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("mean", &MeanStdPair::mean) .def_readonly("std", &MeanStdPair::std); - py::class_(m, "PerfMetrics") + py::class_(m, "PerfMetrics", perf_metrics_docstring) .def(py::init<>()) .def("get_generate_duration", &PerfMetrics::get_generate_duration) .def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration)