diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 1d1486c385..17dffff526 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -63,13 +63,13 @@ jobs: PYTHONPATH: "./build" - run: > . ./ov/setupvars.sh - && timeout 25s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" - | diff <(timeout 25s samples/python/greedy_causal_lm/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") - + && timeout 25s ./build/samples/cpp/text_generation/greedy_causal_lm ./open_llama_3b_v2/ "return 0" + | diff <(timeout 25s samples/python/text_generation/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") - env: PYTHONPATH: "./build" - run: > . ./ov/setupvars.sh - && samples/python/greedy_causal_lm/lora.py ./TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?" + && samples/python/text_generation/lora.py ./TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?" env: PYTHONPATH: "./build" @@ -249,7 +249,7 @@ jobs: - run: > set PATH=.\build\openvino_genai\;%PATH% && call .\ov\setupvars.bat - && .\build\samples\cpp\greedy_causal_lm\Release\greedy_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\cpp.txt + && .\build\samples\cpp\text_generation\Release\greedy_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\cpp.txt - run: | echo import transformers > ref.py echo predictions = open('cpp.txt', 'r').read() >> ref.py @@ -266,13 +266,13 @@ jobs: set PATH=.\build\openvino_genai\;%PATH% && set "PYTHONPATH=./build/" && call .\ov\setupvars.bat - && python samples\python\greedy_causal_lm\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt + && python samples\python\text_generation\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt - run: fc .\cpp.txt .\py.txt - run: > set PATH=.\build\openvino_genai\;%PATH% && set "PYTHONPATH=./build/" && call .\ov\setupvars.bat - && python samples\python\greedy_causal_lm\lora.py .\TinyLlama\TinyLlama-1.1B-intermediate-step-1431k-3T\ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?" + && python samples\python\text_generation\lora.py .\TinyLlama\TinyLlama-1.1B-intermediate-step-1431k-3T\ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?" cpp-greedy_causal_lm-Qwen-7B-Chat: runs-on: ubuntu-20.04-16-cores @@ -304,7 +304,7 @@ jobs: optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat - run: > . ./ov/setupvars.sh - && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) - + && timeout 2m ./build/samples/cpp/text_generation/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/text_generation/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) - env: PYTHONPATH: "./build" @@ -446,7 +446,7 @@ jobs: run: | source ./ov/setupvars.sh ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_speculative.txt - ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/text_generation/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python ./samples/python/speculative_decoding_lm/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt python -c " with open('predictions_greedy.txt', 'r') as f: @@ -504,7 +504,7 @@ jobs: A:' > ./prompt.txt ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt - ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt + ./build/samples/cpp/text_generation/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -525,7 +525,7 @@ jobs: A:' > ./prompt.txt ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./Qwen-7B-Chat/ "$( predictions_prompt_lookup.txt - ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ "$( predictions_greedy.txt + ./build/samples/cpp/text_generation/greedy_causal_lm ./Qwen-7B-Chat/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -566,7 +566,7 @@ jobs: - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/samples/cpp/text_generation/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " @@ -585,7 +585,7 @@ jobs: echo Phi-1_5 passed - run: > . ./ov/setupvars.sh - && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a" + && timeout 50s samples/python/text_generation/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a" | diff ./pred_greedy.txt - env: PYTHONPATH: "./build" @@ -621,7 +621,7 @@ jobs: - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/samples/cpp/text_generation/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " @@ -640,7 +640,7 @@ jobs: echo "Alan Turing was a" passed - run: > . ./ov/setupvars.sh - && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" + && timeout 50s samples/python/text_generation/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" | diff ./pred_greedy.txt - env: PYTHONPATH: "./build" diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 860ced072b..5329510e05 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory(cpp/beam_search_causal_lm) add_subdirectory(cpp/benchmark_genai) add_subdirectory(cpp/chat_sample) -add_subdirectory(cpp/greedy_causal_lm) +add_subdirectory(cpp/text_generation) add_subdirectory(cpp/lora_greedy_causal_lm) add_subdirectory(cpp/multinomial_causal_lm) add_subdirectory(cpp/prompt_lookup_decoding_lm) @@ -25,7 +25,7 @@ install(DIRECTORY cpp/beam_search_causal_lm cpp/benchmark_genai cpp/chat_sample - cpp/greedy_causal_lm + cpp/text_generation cpp/lora_greedy_causal_lm cpp/multinomial_causal_lm # Don't install prompt_lookup_decoding_lm because it doesn't use openvino_genai library and is not verified yet. @@ -39,7 +39,7 @@ install(DIRECTORY python/beam_search_causal_lm python/benchmark_genai python/chat_sample - python/greedy_causal_lm + python/text_generation python/multinomial_causal_lm python/speculative_decoding_lm python/text2image diff --git a/samples/cpp/greedy_causal_lm/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt similarity index 58% rename from samples/cpp/greedy_causal_lm/CMakeLists.txt rename to samples/cpp/text_generation/CMakeLists.txt index ff5151676f..377682974e 100644 --- a/samples/cpp/greedy_causal_lm/CMakeLists.txt +++ b/samples/cpp/text_generation/CMakeLists.txt @@ -20,3 +20,16 @@ install(TARGETS greedy_causal_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) + +add_executable(encrypted_model_causal_lm encrypted_model_causal_lm.cpp) +target_link_libraries(encrypted_model_causal_lm PRIVATE openvino::genai) +set_target_properties(encrypted_model_causal_lm PROPERTIES + COMPILE_PDB_NAME encrypted_model_causal_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(encrypted_model_causal_lm PRIVATE cxx_std_11) + +install(TARGETS encrypted_model_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/text_generation/README.md similarity index 79% rename from samples/cpp/greedy_causal_lm/README.md rename to samples/cpp/text_generation/README.md index 2f3a7751bf..6928d03927 100644 --- a/samples/cpp/greedy_causal_lm/README.md +++ b/samples/cpp/text_generation/README.md @@ -24,6 +24,18 @@ Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. +## Using encrypted models + +LLMPipeline and Tokenizer objects can be initialized directly from the memory buffer, e.g. when user stores only encrypted files and decrypts them on-the-fly. +The following code snippet demonstrates how to load the model from the memory buffer: + +```cpp +auto [model_str, weights_tensor] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin"); +ov::genai::Tokenizer tokenizer(models_path); +ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, device); +``` +For the sake of brevity the code above does not include Tokenizer decryption. For more details look to encrypted_model_causal_lm sample. + ### Troubleshooting #### Unicode characters encoding error on Windows diff --git a/samples/cpp/text_generation/encrypted_model_causal_lm.cpp b/samples/cpp/text_generation/encrypted_model_causal_lm.cpp new file mode 100644 index 0000000000..3ea94d605f --- /dev/null +++ b/samples/cpp/text_generation/encrypted_model_causal_lm.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" +#include + +std::pair decrypt_model(const std::string& model_path, const std::string& weights_path) { + std::ifstream model_file(model_path); + std::ifstream weights_file(weights_path, std::ios::binary); + if (!model_file.is_open() || !weights_file.is_open()) { + throw std::runtime_error("Cannot open model or weights file"); + } + + // User can add file decryption of model_file and weights_file in memory here. + + std::string model_str((std::istreambuf_iterator(model_file)), std::istreambuf_iterator()); + std::vector weights_buffer((std::istreambuf_iterator(weights_file)), std::istreambuf_iterator()); + auto weights_tensor = ov::Tensor(ov::element::u8, {weights_buffer.size()}, weights_buffer.data()); + return {model_str, weights_tensor}; +} + +ov::genai::Tokenizer decrypt_tokenizer(const std::string& models_path) { + std::string tok_model_path = models_path + "/openvino_tokenizer.xml"; + std::string tok_weights_path = models_path + "/openvino_tokenizer.bin"; + auto [tok_model_str, tok_weights_tensor] = decrypt_model(tok_model_path, tok_weights_path); + + std::string detok_model_path = models_path + "/openvino_detokenizer.xml"; + std::string detok_weights_path = models_path + "/openvino_detokenizer.bin"; + auto [detok_model_str, detok_weights_tensor] = decrypt_model(tok_model_path, tok_weights_path); + + return ov::genai::Tokenizer(tok_model_str, tok_weights_tensor, detok_model_str, detok_weights_tensor); +} + +int main(int argc, char* argv[]) try { + if (3 > argc) + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); + + std::string device = "CPU"; // GPU, NPU can be used as well + std::string models_path = argv[1]; + std::string prompt = argv[2]; + + auto [model_str, model_weights] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin"); + ov::genai::Tokenizer tokenizer = decrypt_tokenizer(models_path); + + ov::genai::LLMPipeline pipe(model_str, model_weights, tokenizer, device); + + std::string result = pipe.generate(prompt, ov::genai::max_new_tokens(100)); + std::cout << result << std::endl; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp similarity index 100% rename from samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp rename to samples/cpp/text_generation/greedy_causal_lm.cpp diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index 3a655374e9..e426965e66 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -18,7 +18,7 @@ int main(int argc, char* argv[]) try { std::string device = "CPU"; // GPU can be used as well ov::AnyMap enable_compile_cache; - if ("GPU" == device) { + if (device == "GPU") { // Cache compiled models on disk for GPU to save time on the // next run. It's not beneficial for CPU. enable_compile_cache.insert({ov::cache_dir("vlm_cache")}); diff --git a/samples/python/greedy_causal_lm/README.md b/samples/python/text_generation/README.md similarity index 100% rename from samples/python/greedy_causal_lm/README.md rename to samples/python/text_generation/README.md diff --git a/samples/python/greedy_causal_lm/greedy_causal_lm.py b/samples/python/text_generation/greedy_causal_lm.py similarity index 100% rename from samples/python/greedy_causal_lm/greedy_causal_lm.py rename to samples/python/text_generation/greedy_causal_lm.py diff --git a/samples/python/greedy_causal_lm/lora.py b/samples/python/text_generation/lora.py similarity index 100% rename from samples/python/greedy_causal_lm/lora.py rename to samples/python/text_generation/lora.py diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index 2bf5dd773b..4a0637f2d9 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -88,6 +88,32 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { const ov::AnyMap& properties = {} ); + /** + * @brief Constructs a ContinuousBatchingPipeline from already existing model and tokenizer. + * + * This constructor allows for the creation of a ContinuousBatchingPipeline using an existing model + * represented as a string and a weights tensor, along with a manually initialized tokenizer. + * This is useful when the model and tokenizer are already loaded or created in memory and do not + * need to be loaded from files. + * + * @param model_str A string representation of the model. + * @param weights_tensor A tensor containing the weights of the model. + * @param tokenizer A manually initialized ov::genai::Tokenizer. + * @param scheduler_config Configuration for the scheduler. + * @param device The device to run the pipeline on (e.g., CPU, GPU). + * @param properties Optional properties for the pipeline. + * @param generation_config Optional generation configuration for the pipeline. + */ + ContinuousBatchingPipeline( + const std::string& model_str, + const ov::Tensor& weights_tensor, + const ov::genai::Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& properties = {}, + const ov::genai::GenerationConfig& generation_config = {} + ); + ov::genai::Tokenizer get_tokenizer(); ov::genai::GenerationConfig get_config() const; diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 3d62535120..44427d45b1 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -112,6 +112,15 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { const ov::AnyMap& properties = {} ); + LLMPipeline( + const std::string& model_str, + const ov::Tensor& weights_tensor, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& properties = {}, + const ov::genai::GenerationConfig& generation_config = {} + ); + OPENVINO_DEPRECATED("Please, specify device explicitly when create LLMPipeline. This overload will be removed in 2025.0.0 release") explicit LLMPipeline(const std::filesystem::path& path) : LLMPipeline(path, "CPU") { } @@ -274,6 +283,14 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { OPENVINO_GENAI_EXPORTS std::pair streamer(StreamerVariant func); OPENVINO_GENAI_EXPORTS std::pair generation_config(const GenerationConfig& config); +OPENVINO_GENAI_EXPORTS std::pair draft_model( + std::string& model_str, + ov::Tensor& weights_tensor, + const ov::genai::Tokenizer& tokenizer, + const std::string& device = {}, + const ov::AnyMap& properties = {}, + const ov::genai::GenerationConfig& generation_config = {}); + OPENVINO_GENAI_EXPORTS std::pair draft_model( const std::filesystem::path& models_path, const std::string& device = {}, diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 36f63d2b5e..38fc0aaf8c 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -28,12 +28,72 @@ struct TokenizedInputs { class OPENVINO_GENAI_EXPORTS Tokenizer { public: /** - * @brief ov::genai::Tokenizer constructor. - * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path - * @param properties Properties passed to ov::Core::compile_model - */ - Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {}); + * @brief ov::genai::Tokenizer constructor. + * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path + * @param properties Properties passed to ov::Core::compile_model + */ + explicit Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {}); + + /** + * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights + * + * This constructor is used when tokenizer and detokenizer are separate models already loaded into memory. + * When this constructor is used bos, eos, pad token ids are expected to be in IR. + * If an IR is older (< 2024.3) then this tokens are default initialized to be ignored. + * @param tokenizer_model_str tokenizer model string + * @param tokenizer_weights_tensor ov::Tensor with tokenizer weights + * @param detokenizer_model_str detokenizer model string + * @param detokenizer_weights_tensor ov::Tensor with detokenizer weights + * @param properties Properties passed to ov::Core::compile_model + */ + Tokenizer( + const std::string& tokenizer_model_str, + ov::Tensor& tokenizer_weights_tensor, + std::string& detokenizer_model_str, + ov::Tensor& detokenizer_weights_tensor, + const ov::AnyMap& properties = {} + ); + + /** + * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights. + * + * This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's + * tokenizer or detokenizer is defined from model input signature. When this constructor is used bos, eos, pad token ids + * are expected to be in IR. If an IR is older (< 2024.3) then this tokens are default initialized to be ignored. + * @param model_str model string + * @param weights_tensor ov::Tensor with model weights + * @param properties Properties passed to ov::Core::compile_model + */ + Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties = {}); + /** + * @brief ov::genai::Tokenizer constructor with variable number of properties + * @param tokenizer_model_str tokenizer model string + * @param tokenizer_weights_tensor ov::Tensor with tokenizer weights + * @param detokenizer_model_str detokenizer model string + * @param detokenizer_weights_tensor ov::Tensor with detokenizer weights + * @param properties optional properties + */ + template ::value, bool>::type = true> + Tokenizer( + const std::string& tokenizer_model_str, + ov::Tensor& tokenizer_weights_tensor, + std::string& detokenizer_model_str, + ov::Tensor& detokenizer_weights_tensor, + Properties&&... properties + ) : Tokenizer(tokenizer_model_str, tokenizer_weights_tensor, detokenizer_model_str, detokenizer_weights_tensor, ov::AnyMap{std::forward(properties)...}) { } + + /** + * @brief ov::genai::Tokenizer constructor with variable number of properties + * @param model_str model string + * @param weights_tensor ov::Tensor with model weights + * @param properties optional properties + */ + template ::value, bool>::type = true> + Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, + Properties&&... properties) + : Tokenizer(model_str, weights_tensor, ov::AnyMap{std::forward(properties)...}) { } + /** * @brief ov::genai::Tokenizer constructor with variable number of properties * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 901c5c64be..d27e8934dc 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -11,22 +11,21 @@ template struct overloaded : Ts... {using Ts::operator()...;}; template overloaded(Ts...) -> overloaded; ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( - const std::filesystem::path& models_path, + const std::shared_ptr& model, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, - const ov::AnyMap& properties) { + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config + ) { m_tokenizer = tokenizer; - m_generation_config = utils::from_config_json_if_exists(models_path); - + m_generation_config = generation_config; + ov::Core core; auto [core_properties, compile_properties] = utils::split_core_compile_config(properties); core.set_property(core_properties); - // The model can be compiled for GPU as well - std::shared_ptr model = core.read_model((models_path / "openvino_model.xml").string()); - DeviceConfig device_config(core, scheduler_config, device, compile_properties); bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction; diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index 8276edb36b..780bff6a31 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -53,22 +53,12 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc void _fill_prompt_log_probs(std::vector& sequence_groups, ov::Tensor& logits); public: - ContinuousBatchingImpl(const std::filesystem::path& models_path, + ContinuousBatchingImpl(const std::shared_ptr& model, const Tokenizer& tokenizer, - const SchedulerConfig& scheduler_config, - const std::string& device, - const ov::AnyMap& properties); - - ContinuousBatchingImpl(const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& properties, - const ov::AnyMap& tokenizer_properties) - : ContinuousBatchingImpl{ models_path, - Tokenizer(models_path, tokenizer_properties), - scheduler_config, - device, - properties } {} + const ov::genai::GenerationConfig& generation_config); GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 6dcbf342eb..2faad4354e 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -20,7 +20,7 @@ using namespace ov::genai; inline ov::genai::ModelDesc extract_draft_model_from_config(ov::AnyMap& config) { - ov::genai::ModelDesc draft_model(""); + ov::genai::ModelDesc draft_model; if (config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end()) { draft_model = config.at(utils::DRAFT_MODEL_ARG_NAME).as(); config.erase(utils::DRAFT_MODEL_ARG_NAME); @@ -28,17 +28,24 @@ extract_draft_model_from_config(ov::AnyMap& config) { return draft_model; } + ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& properties, const ov::AnyMap& tokenizer_properties) { auto properties_without_draft_model = properties; - auto draft_model = extract_draft_model_from_config(properties_without_draft_model); - if (draft_model.models_path.empty()) { - m_impl = std::make_shared(models_path, scheduler_config, device, properties, tokenizer_properties); + auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model); + + std::filesystem::path openvino_model_name = "openvino_model.xml"; + auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string()); + auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties); + auto generation_config = utils::from_config_json_if_exists(models_path); + if (draft_model_desr.model == nullptr) { + m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config); } else { - m_impl = std::make_shared(models_path, scheduler_config, device, properties_without_draft_model, draft_model, tokenizer_properties); + auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); + m_impl = std::make_shared(main_model_descr, draft_model_desr); } } @@ -49,11 +56,36 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& device, const ov::AnyMap& properties) { auto properties_without_draft_model = properties; - auto draft_model = extract_draft_model_from_config(properties_without_draft_model); - if (draft_model.models_path.empty()) { - m_impl = std::make_shared(models_path, tokenizer, scheduler_config, device, properties); + auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model); + std::filesystem::path openvino_model_name = "openvino_model.xml"; + auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string()); + auto generation_config = utils::from_config_json_if_exists(models_path); + + if (draft_model_desr.model == nullptr) { + m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config); + } else { + auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); + m_impl = std::make_shared(main_model_descr, draft_model_desr); + } +} + +ContinuousBatchingPipeline::ContinuousBatchingPipeline( + const std::string& model_str, + const ov::Tensor& weights_tensor, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config) { + auto properties_without_draft_model = properties; + auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model); + auto model = utils::singleton_core().read_model(model_str, weights_tensor); + + if (draft_model_desr.model == nullptr) { + m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config); } else { - m_impl = std::make_shared(models_path, scheduler_config, device, properties_without_draft_model, draft_model); + auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); + m_impl = std::make_shared(main_model_descr, draft_model_desr); } } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 5d82a96010..84f76730eb 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -59,23 +59,31 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config - ) : LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(models_path)) - { + ) : StatefulLLMPipeline{ + ov::genai::utils::read_model_with_config(models_path, plugin_config), + tokenizer, + device, + plugin_config, + utils::from_config_json_if_exists(models_path) + } {} + + StatefulLLMPipeline( + const std::shared_ptr& model, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& config, + const ov::genai::GenerationConfig& generation_config + ) : LLMPipelineImplBase(tokenizer, generation_config) { ov::Core core; + auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config); + utils::slice_matmul_statefull_model(model); + if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { - auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(*filtered_plugin_config); - core.set_property(core_plugin_config); - auto model = core.read_model(models_path / "openvino_model.xml"); m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model."); m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable - utils::slice_matmul_statefull_model(model); - m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); + m_model_runner = core.compile_model(model, device, *filtered_plugin_config).create_infer_request(); } else { - auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(plugin_config); - core.set_property(core_plugin_config); - auto model = core.read_model(models_path / "openvino_model.xml"); - utils::slice_matmul_statefull_model(model); - m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); + m_model_runner = core.compile_model(model, device, plugin_config).create_infer_request(); } // If eos_token_id was not provided, take value @@ -87,7 +95,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& plugin_config - ) : StatefulLLMPipeline{models_path, Tokenizer(models_path.string()), device, plugin_config} {} + ) : StatefulLLMPipeline{models_path, Tokenizer(models_path), device, plugin_config} {} DecodedResults generate( StringInputs inputs, @@ -382,14 +390,26 @@ std::pair draft_model( const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties) { - ov::AnyMap plugin_config = properties; - auto it = plugin_config.find(ov::genai::scheduler_config.name()); - SchedulerConfig scheduler_config; - if (it != plugin_config.end()) { - scheduler_config = it->second.as(); - plugin_config.erase(it); - } - return { utils::DRAFT_MODEL_ARG_NAME, Any::make(models_path, device, plugin_config, scheduler_config) }; + auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); + + std::filesystem::path openvino_model_name = "openvino_model.xml"; + auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string()); + auto generation_config = utils::from_config_json_if_exists(models_path); + auto tokenizer = ov::genai::Tokenizer(models_path); + return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model, tokenizer, device, plugin_config, scheduler_config, generation_config) }; +} + +std::pair draft_model( + std::string& model_str, + ov::Tensor& weights_tensor, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config) { + auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); + + auto model = utils::singleton_core().read_model(model_str, weights_tensor); + return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model, tokenizer, device, plugin_config, scheduler_config, generation_config) }; } } // namespace genai @@ -431,6 +451,23 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { m_generation_config = m_impl.get_config(); } + ContinuousBatchingAdapter( + const std::string& model_str, + const ov::Tensor& weights_tensor, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config, + const ov::genai::GenerationConfig& generation_config + ): LLMPipelineImplBase{tokenizer}, m_impl{ + model_str, + weights_tensor, + tokenizer, + scheduler_config, + device, + plugin_config, + generation_config} {} + ContinuousBatchingAdapter( const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, @@ -543,6 +580,29 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { m_impl.finish_chat(); }; }; + +/* +* NPU reads some properties from the config file, but when LLMPipeline is initialized +* from the model_str and weights_tensor, there are not files. +* In the later case ModelDesc is stored in properties. +* This function pops ModelDescr from the the properties and returns a pair of updated properties and ModelDescr. +*/ +std::pair split_model_descr(const ov::AnyMap& properties) { + ov::AnyMap main_properties = properties; + ov::genai::ModelConfigDesc model_descr; + + auto pop_property = [](ov::AnyMap& orig_propertis, const std::string& key, auto& value) { + if (orig_propertis.find(key) != orig_propertis.end()) { + value = orig_propertis.at(key).as>(); + orig_propertis.erase(key); + } + }; + pop_property(main_properties, "name_or_path", model_descr.name_or_path); + pop_property(main_properties, "type", model_descr.type); + pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads); + + return {main_properties, model_descr}; +} } ov::genai::LLMPipeline::LLMPipeline( @@ -564,11 +624,9 @@ ov::genai::LLMPipeline::LLMPipeline( ){ auto start_time = std::chrono::steady_clock::now(); if (properties.find(ov::genai::scheduler_config.name()) != properties.end()) { - auto config_without_scheduler_config = properties; - config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); - auto& scheduler_config = properties.at(ov::genai::scheduler_config.name()).as(); - m_pimpl = std::make_unique(models_path, tokenizer, scheduler_config, device, config_without_scheduler_config); - } else if ("NPU" == device) { + auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); + m_pimpl = std::make_unique(models_path, tokenizer, scheduler_config, device, plugin_config); + } else if (device == "NPU") { m_pimpl = std::make_unique(models_path, tokenizer, device, properties); } else { m_pimpl = std::make_unique(models_path, tokenizer, device, properties); @@ -583,12 +641,11 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::AnyMap& config ){ auto start_time = std::chrono::steady_clock::now(); + if (config.find(ov::genai::scheduler_config.name()) != config.end()) { - auto config_without_scheduler_config = config; - config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); - auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as(); - m_pimpl = std::make_unique(models_path, scheduler_config, device, config_without_scheduler_config); - } else if ("NPU" == device) { + auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config); + m_pimpl = std::make_unique(models_path, scheduler_config, device, plugin_config); + } else if (device == "NPU") { m_pimpl = std::make_unique(models_path, device, config); } else { m_pimpl = std::make_unique(models_path, device, config); @@ -597,6 +654,55 @@ ov::genai::LLMPipeline::LLMPipeline( m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } +ov::genai::LLMPipeline::LLMPipeline( + const std::string& model_str, + const ov::Tensor& weights_tensor, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& config, + const ov::genai::GenerationConfig& generation_config +){ + auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config); + + auto start_time = std::chrono::steady_clock::now(); + if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) { + auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config); + m_pimpl = std::make_unique(model_str, weights_tensor, + tokenizer, scheduler_config, device, plugin_config_, generation_config); + } else if (device == "NPU") { + // TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution. + // NPU reads some properties from the config file, but when LLMPipeline is initialized + // from the model_str and weights_tensor, there is no files. + // Therefore, we need to pass these properties manually. + // This is necessary only for NPU, for other plugins can be ommited. + // Example of usage: + // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"}, + // {"type", "llama"}, + // {"num_key_value_heads", 32}}; + // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties); + // This will convert from AnyMap to ModelDesc. + auto [properties, model_descr] = split_model_descr(plugin_config); + + m_pimpl = std::make_unique( + utils::singleton_core().read_model(model_str, weights_tensor), + model_descr, + tokenizer, + device, + properties, + generation_config + ); + } else { + m_pimpl = std::make_unique( + utils::singleton_core().read_model(model_str, weights_tensor), + tokenizer, + device, + plugin_config, + generation_config); + } + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); +} + ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { return m_pimpl->m_generation_config; } diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 4bb84dfc05..cb83209b4b 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -314,7 +314,7 @@ std::optional pop_option(ov::AnyMap& config, const std::string& option_ } template -std::optional get_option(ov::AnyMap& config, const std::string& option_name) { +std::optional get_option(const ov::AnyMap& config, const std::string& option_name) { if (auto it = config.find(option_name); it != config.end()) { return std::make_optional(it->second.as()); } @@ -396,18 +396,12 @@ KVAxesPosition get_kv_axes(const std::string& model_type) { return axes; } -struct ModelDesc { - std::string type; - std::string name_or_path; - int num_key_value_heads; -}; - -ModelDesc get_modeldesc_from_json(const std::filesystem::path& filepath) { +ov::genai::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) { std::ifstream file(filepath); OPENVINO_ASSERT(file.is_open(), "Could not open file: " + filepath.string()); nlohmann::json config_data = nlohmann::json::parse(file); - ModelDesc desc; + ov::genai::ModelConfigDesc desc; desc.type = config_data["model_type"].get(); // NB: In case _name_or_path field isn't presented in config.json if (config_data.contains("_name_or_path")) { @@ -664,7 +658,9 @@ StaticLLMPipeline::StaticLLMPipeline( */ const auto use_blobs = pop_or_default(properties, "USE_BLOBS", false); if (!use_blobs) { - setupAndCompileModels(models_path, device, properties); + ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json"); + auto model = genai::utils::singleton_core().read_model((models_path / "openvino_model.xml").string()); + setupAndCompileModels(model, device, model_desc, properties); } else { setupAndImportModels(models_path, device, properties); } @@ -684,9 +680,39 @@ StaticLLMPipeline::StaticLLMPipeline( ) : StaticLLMPipeline(models_path, Tokenizer(models_path), device, properties) { } +StaticLLMPipeline::StaticLLMPipeline( + const std::shared_ptr& model, + const ModelConfigDesc& model_desc, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config +) : LLMPipelineImplBase(tokenizer, generation_config) { + + bool use_blobs = false; + auto anyopt = get_option(properties, "USE_BLOBS"); + if (anyopt.has_value()) { + use_blobs = *anyopt; + } + // Using model_str and weights_tesnor with blobs is meaningless. + OPENVINO_ASSERT(!use_blobs, "blobs cannot be used with model string and weights tensor"); + + auto properties_ = properties; + setupAndCompileModels(model, device, model_desc, properties_); + + // Initialize tensors + prepare_for_new_conversation(); + + // If eos_token_id was not provided, take value + if (m_generation_config.eos_token_id == -1) { + m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); + } +} + void StaticLLMPipeline::setupAndCompileModels( - const std::filesystem::path& models_path, + const std::shared_ptr& model, const std::string& device, + const ModelConfigDesc& model_desc, ov::AnyMap& properties) { /* Initialization assumes multiple steps if user passes "USE_BLOBS=NO": 1) Read the template model - this will be kvcache model @@ -705,7 +731,7 @@ void StaticLLMPipeline::setupAndCompileModels( // NB: Get information about NPU if available auto npudesc = extract_npu_descriptor(core); // (1) Read the template model - this will be kvcache model - auto kvcache_model = core.read_model((models_path / "openvino_model.xml").string()); + auto kvcache_model = model; // (2) Expose KV-cache input and output layers from kvcache model ov::pass::StatefulToStateless().run_on_model(kvcache_model); // (3) Align u4 ZP constants @@ -716,7 +742,7 @@ void StaticLLMPipeline::setupAndCompileModels( // (5) Reshape both models to static shape const uint32_t kMaxPromptLen = align_to(pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u), 64u); const uint32_t kMinResponseLen = align_to(pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u), 64u); - ModelDesc model_desc = get_modeldesc_from_json(models_path / "config.json"); + KVAxesPosition axes = get_kv_axes(model_desc.type); m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len, false}; reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index d8e59d867a..7acc28c684 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -10,6 +10,12 @@ namespace ov { namespace genai { +struct ModelConfigDesc { + std::string type; + std::string name_or_path; + int num_key_value_heads; +}; + class StaticLLMPipeline final : public LLMPipelineImplBase { public: StaticLLMPipeline( @@ -19,6 +25,15 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { const ov::AnyMap& config ); + StaticLLMPipeline( + const std::shared_ptr& model, + const ModelConfigDesc& model_desc, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config = {} + ); + StaticLLMPipeline( const std::filesystem::path& path, const std::string& device, @@ -26,8 +41,9 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { ); void setupAndCompileModels( - const std::filesystem::path& path, + const std::shared_ptr& model, const std::string& device, + const ModelConfigDesc& model_desc, ov::AnyMap& pipeline_config); void setupAndImportModels( diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index ecce79ac4e..2be67320a9 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -23,27 +23,22 @@ bool are_tokenizers_equal(Tokenizer& lhs, Tokenizer& rhs) { lhs.get_bos_token_id() == rhs.get_bos_token_id() && lhs.get_pad_token_id() == rhs.get_pad_token_id(); } -ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl( - const std::filesystem::path& main_models_path, - const SchedulerConfig& main_scheduler_config, - const std::string& main_device, - const ov::AnyMap& main_properties, - const ov::genai::ModelDesc draft_model_desc, - const ov::AnyMap& tokenizer_properties) { +ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, + const ov::genai::ModelDesc& draft_model_desc) { ov::Core core; - auto [core_properties, compile_properties] = utils::split_core_compile_config(main_properties); + auto [core_properties, compile_properties] = utils::split_core_compile_config(main_model_desc.properties); core.set_property(core_properties); - std::filesystem::path openvino_model_name = "openvino_model.xml", - draft_models_path = draft_model_desc.models_path; + auto main_model = main_model_desc.model; + auto draft_model = draft_model_desc.model; - std::shared_ptr main_model = core.read_model((main_models_path / openvino_model_name).string()), - draft_model = core.read_model((draft_models_path / openvino_model_name).string()); + auto main_scheduler_config = main_model_desc.scheduler_config; + auto main_device = main_model_desc.device; - utils::apply_paged_attention_transformations(main_model, main_scheduler_config.use_cache_eviction); - utils::apply_paged_attention_transformations(draft_model, main_scheduler_config.use_cache_eviction); + utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction); + utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction); - std::string draft_device = draft_model_desc.device.empty() ? main_device : draft_model_desc.device; + std::string draft_device = draft_model_desc.device.empty() ? main_model_desc.device : draft_model_desc.device; bool is_scheduler_undefined = draft_model_desc.scheduler_config == SchedulerConfig(); @@ -76,8 +71,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl( // main and draft model can have different tokenizers // to do: support retokenization: 154103 - Tokenizer main_model_tokenizer(main_models_path, tokenizer_properties), - draft_model_tokenizer(draft_models_path, tokenizer_properties); + Tokenizer main_model_tokenizer = main_model_desc.tokenizer; + Tokenizer draft_model_tokenizer = draft_model_desc.tokenizer; // todo: remove this condition after support of CVS-154103 OPENVINO_ASSERT(are_tokenizers_equal(main_model_tokenizer, draft_model_tokenizer), "Tokenizers for draft and main models are different!"); @@ -86,10 +81,10 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl( // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode m_main_pipeline = std::make_shared(core, - main_model, main_model_tokenizer, utils::from_config_json_if_exists(main_models_path), + main_model, main_model_tokenizer, main_model_desc.generation_config, main_device_config, main_scheduler_config, main_device, compile_properties, true); m_draft_pipeline = std::make_shared(core, - draft_model, draft_model_tokenizer, utils::from_config_json_if_exists(draft_models_path), + draft_model, draft_model_tokenizer, draft_model_desc.generation_config, draft_device_config, draft_scheduler_config, draft_device, draft_properties, false); } diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp index f854713b5e..3df02ac394 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp @@ -11,19 +11,27 @@ namespace ov::genai { struct ModelDesc { - std::filesystem::path models_path; std::string device; ov::genai::SchedulerConfig scheduler_config; ov::AnyMap properties; + ov::genai::GenerationConfig generation_config; + std::shared_ptr model = nullptr; + ov::genai::Tokenizer tokenizer; - ModelDesc(const std::filesystem::path& models_path, + ModelDesc(const std::shared_ptr& model, + const ov::genai::Tokenizer& tokenizer, const std::string& device = {}, const ov::AnyMap& properties = {}, - const ov::genai::SchedulerConfig& scheduler_config = {}) : - models_path(models_path), + const ov::genai::SchedulerConfig& scheduler_config = {}, + const ov::genai::GenerationConfig& generation_config = {}) : + model(model), + tokenizer(tokenizer), device(device), properties(properties), - scheduler_config(scheduler_config) {} + scheduler_config(scheduler_config), + generation_config(generation_config) {} + + ModelDesc() = default; }; class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::ImplInterface { @@ -35,12 +43,7 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat std::map m_draft_generations; public: - SpeculativeDecodingImpl(const std::filesystem::path& main_models_path, - const SchedulerConfig& scheduler_config, - const std::string& device, - const ov::AnyMap& properties, - const ov::genai::ModelDesc draft_model_desc, - const ov::AnyMap& tokenizer_properties = {}); + SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, const ov::genai::ModelDesc& draft_model_desc); GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 41f9a6abd4..23ab3f1363 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -67,6 +67,19 @@ constexpr char bos_token_key_name[] = "bos_token"; constexpr char eos_token_key_name[] = "eos_token"; constexpr char pad_token_key_name[] = "pad_token"; +ov::Core core_with_extension() { + ov::Core core; + const char* ov_tokenizer_path = getenv(ScopedVar::ENVIRONMENT_VARIABLE_NAME); + OPENVINO_ASSERT(ov_tokenizer_path, "openvino_tokenizers path is not set"); + core.add_extension(ov_tokenizer_path); + return core; +} + +ov::Core get_core_singleton() { + static ov::Core core = core_with_extension(); + return core; +} + } // namespace namespace ov { @@ -76,7 +89,7 @@ class Tokenizer::TokenizerImpl { public: ov::CompiledModel m_tokenizer; ov::CompiledModel m_detokenizer; - + std::unique_ptr> m_ireq_queue_tokenizer; std::unique_ptr> m_ireq_queue_detokenizer; // To change the adding special tokens mode we use a statefull subgraph, @@ -135,65 +148,101 @@ class Tokenizer::TokenizerImpl { TokenizerImpl() = default; - TokenizerImpl(std::filesystem::path tokenizer_path, const ov::AnyMap& properties) - : m_chat_template{chat_template_from_tokenizer_json_if_exists(tokenizer_path)} { - ov::Core core; - - OPENVINO_ASSERT(tokenizer_path.extension() != ".xml", "'tokenizer_path' parameter should be a path to a dir not a xml file"); + TokenizerImpl(const std::filesystem::path& models_papth, const ov::AnyMap& properties) { + setupTokenizer(models_papth, properties); + } - const char* ov_tokenizer_path = getenv(ScopedVar::ENVIRONMENT_VARIABLE_NAME); - OPENVINO_ASSERT(ov_tokenizer_path, "openvino_tokenizers path is not set"); - core.add_extension(ov_tokenizer_path); + TokenizerImpl(const std::pair, std::shared_ptr>& models, const ov::AnyMap& properties) { + setupTokenizer(models, properties); + } - read_config(tokenizer_path); - read_special_tokens_map(tokenizer_path); + void setupTokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) { + ScopedVar env_manager(tokenizers_relative_to_genai().string()); + auto core = get_core_singleton(); - // Try to read tokenizer_config if some token ids or token str are not defined. - read_tokenizer_config_if_necessary(tokenizer_path); + OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_papth' parameter should be a path to a dir not a xml file"); - auto device = "CPU"; // currently openvino_tokenizer supports only CPU - auto ov_tokenizer = core.read_model(tokenizer_path / "openvino_tokenizer.xml"); + std::shared_ptr ov_tokenizer = nullptr; std::shared_ptr ov_detokenizer = nullptr; - if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) { - ov_detokenizer = core.read_model(tokenizer_path / "openvino_detokenizer.xml"); + + if (std::filesystem::exists(models_path / "openvino_tokenizer.xml")) { + ov_tokenizer = core.read_model(models_path / "openvino_tokenizer.xml"); } - m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1; - ov::pass::Manager manager_tok; - manager_tok.register_pass(); - manager_tok.run_passes(ov_tokenizer); + if (std::filesystem::exists(models_path / "openvino_detokenizer.xml")) { + ov_detokenizer = core.read_model(models_path / "openvino_detokenizer.xml"); + } + + setupTokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties); + + // If special tokens were not found from IR, try to read them from config. + // This will be triggered only for IRs older than 2024.3. + if (m_pad_token_id == -1 || m_bos_token_id == -1 || m_eos_token_id == -1 || + m_pad_token.empty() || m_bos_token.empty() || m_eos_token.empty()) { + read_config(models_path); + read_special_tokens_map(models_path); + // Try to read tokenizer_config if some token ids or token str are not defined. + read_tokenizer_config_if_necessary(models_path); + } - m_tokenizer = core.compile_model(ov_tokenizer, device, properties); + // If chat_template was not found in IR, try to read them from config. + if (m_chat_template.empty()) { + m_chat_template = chat_template_from_tokenizer_json_if_exists(models_path); + } + } + + + void setupTokenizer(const std::pair, std::shared_ptr>& models, const ov::AnyMap& properties) { + auto [ov_tokenizer, ov_detokenizer] = models; + + m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1; + auto core = get_core_singleton(); + std::string device = "CPU"; // only CPU is supported for now + if (ov_tokenizer) { + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(ov_tokenizer); + m_tokenizer = core.compile_model(ov_tokenizer, device, properties); + + m_ireq_queue_tokenizer = std::make_unique>( + m_tokenizer.get_property(ov::optimal_number_of_infer_requests), + [this]() -> ov::InferRequest { + return std::move(this->m_tokenizer.create_infer_request()); + }); + } + if (ov_detokenizer) { ov::pass::Manager manager_detok; manager_detok.register_pass(); manager_detok.run_passes(ov_detokenizer); m_detokenizer = core.compile_model(ov_detokenizer, device, properties); - } - const size_t INFER_REQUEST_QUEUE_SIZE = m_tokenizer.get_property(ov::optimal_number_of_infer_requests); - m_ireq_queue_tokenizer = std::make_unique>( - INFER_REQUEST_QUEUE_SIZE, - [this]() -> ov::InferRequest { - return std::move(this->m_tokenizer.create_infer_request()); - }); - if (m_detokenizer) { m_ireq_queue_detokenizer = std::make_unique>( - INFER_REQUEST_QUEUE_SIZE, + m_detokenizer.get_property(ov::optimal_number_of_infer_requests), [this]() -> ov::InferRequest { return std::move(this->m_detokenizer.create_infer_request()); }); } - - // Get special token ids by inference if they are not defined. - infer_special_tokens_if_necessary(); + // Initialize tokenizer's cache to save time later. - // infer_special_tokens_if_necessary() already could do that - // but it didn't run decode() for sure. - // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup. - auto tokenized_input = encode("non empty string").input_ids; + if (m_tokenizer) { + // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup. + encode("non empty string").input_ids; if (m_detokenizer) - decode(tokenized_input); + decode({1, 33, 199, 42, 42}); + } + + utils::read_rt_info(ov_tokenizer, "chat_template", m_chat_template); + utils::read_rt_info(ov_tokenizer, "pad_token_id", m_pad_token_id); + utils::read_rt_info(ov_tokenizer, "bos_token_id", m_bos_token_id); + utils::read_rt_info(ov_tokenizer, "eos_token_id", m_eos_token_id); + + m_chat_template = patch_chat_template(m_chat_template); + if (m_detokenizer) { + m_pad_token = decode(std::vector{m_pad_token_id}); + m_bos_token = decode(std::vector{m_bos_token_id}); + m_eos_token = decode(std::vector{m_eos_token_id}); + } } // load special tokens ids from config.json @@ -453,7 +502,7 @@ class Tokenizer::TokenizerImpl { std::string res; ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res); - + return patch_chat_template(res); } @@ -518,10 +567,40 @@ class Tokenizer::TokenizerImpl { }; Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties) { - ScopedVar env_manager(tokenizers_relative_to_genai().string()); m_pimpl = std::make_shared(tokenizer_path, properties); } +Tokenizer::Tokenizer( + const std::string& tokenizer_model_str, + ov::Tensor& tokenizer_weights_tensor, + std::string& detokenizer_model_str, + ov::Tensor& detokenizer_weights_tensor, + const ov::AnyMap& properties +) { + ScopedVar env_manager(tokenizers_relative_to_genai().string()); + auto core = get_core_singleton(); + + auto ov_tokenizer = core.read_model(tokenizer_model_str, tokenizer_weights_tensor); + auto ov_detokenizer = core.read_model(detokenizer_model_str, detokenizer_weights_tensor); + m_pimpl = std::make_shared(std::make_pair(ov_tokenizer, ov_detokenizer), properties); +} + +Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties) { + ScopedVar env_manager(tokenizers_relative_to_genai().string()); + auto core = get_core_singleton(); + auto model = core.read_model(model_str, weights_tensor); + + auto parameters = model->get_parameters(); + OPENVINO_ASSERT(!parameters.empty()); + if (parameters.front()->get_element_type() == ov::element::string) { + // It's a tokenizer + m_pimpl = std::make_shared(std::make_pair(model, nullptr), properties); + } else { + // It's a detokenizer + m_pimpl = std::make_shared(std::make_pair(nullptr, model), properties); + } +} + TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) { check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()}); return m_pimpl->encode(std::move(prompt), tokenization_params); @@ -557,6 +636,7 @@ std::vector Tokenizer::decode(std::vector> lin return m_pimpl->decode(lines, detokenization_params); } + int64_t Tokenizer::get_bos_token_id() const { return m_pimpl->m_bos_token_id; } diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index f1718a8a5d..337b0ab47e 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -219,6 +219,29 @@ std::pair split_core_compile_config(const ov::AnyMap& pr return {core_properties, compile_properties}; }; +/** + * scheduler_config is a separate config for continuous batching pipeline. + * This routine splits scheduler_config from plugin_config. + */ +std::pair split_scheduler_config(const ov::AnyMap& properties) { + ov::AnyMap plugin_config = properties; + auto it = plugin_config.find(ov::genai::scheduler_config.name()); + SchedulerConfig scheduler_config; + if (it != plugin_config.end()) { + scheduler_config = it->second.as(); + plugin_config.erase(it); + } + return {plugin_config, scheduler_config}; +}; + +std::shared_ptr read_model_with_config(const std::filesystem::path& models_path, const ov::AnyMap& properties) { + auto [core_properties, compile_properties] = split_core_compile_config(properties); + ov::Core core; + core.set_property(core_properties); + std::filesystem::path openvino_model_name = "openvino_model.xml"; + return core.read_model((models_path / openvino_model_name).string()); +} + ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend) { auto minuend_size = minuend.input_ids.get_size(); auto subtrahend_size = subtrahend.input_ids.get_size(); @@ -261,6 +284,23 @@ void slice_matmul_statefull_model(std::shared_ptr model) { } } +template +void read_rt_info(std::shared_ptr& model, const char* name, T& value) { + if (!model) + return; + if (model->get_rt_info().count(name) == 0) + return; + auto str_value = model->get_rt_info().at(name).as(); + if constexpr (std::is_same::value) { + value = std::stoll(str_value); + } else if constexpr (std::is_same::value) { + value = str_value; + } +} + +template void read_rt_info(std::shared_ptr&, const char*, int64_t&); +template void read_rt_info(std::shared_ptr&, const char*, std::string&); + ov::Core singleton_core() { static ov::Core core; return core; diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index fb58022d5f..be6f0e9442 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -78,7 +78,11 @@ ProcessorConfig from_any_map( const ProcessorConfig& initial ); + std::pair split_core_compile_config(const ov::AnyMap& properties); +std::pair split_scheduler_config(const ov::AnyMap& properties); + +std::shared_ptr read_model_with_config(const std::filesystem::path& models_path, const ov::AnyMap& properties); ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend); diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 5730def0c6..b633497d32 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -223,6 +223,38 @@ def model_tmp_path(tmpdir_factory): shutil.copy(src_file, temp_path / src_file.name) yield model_id, Path(temp_path) +@pytest.fixture(scope="module") +def model_tokenizers_path_tmp_path(tmpdir_factory): + model_id, path, _, _, _ = read_model(get_models_list()[0]) + temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) + + # If tokens were not found in IR, it fallback to reading from config. + # There was no easy way to add tokens to IR in tests, so we remove them + # and set tokens in configs and to check if they are read and validated correctly. + import openvino as ov + + # copy openvino converted model and tokenizers + for pattern in ['*.xml', '*.bin']: + for src_file in path.glob(pattern): + core = ov.Core() + + # Update files if they are openvino_tokenizer.xml or openvino_detokenizer.xml + if src_file.name in ['openvino_tokenizer.xml', 'openvino_detokenizer.xml']: + if src_file.exists(): + # Load the XML content + ov_model = core.read_model(src_file) + # Add empty rt_info so that tokens will be read from config instead of IR + ov_model.set_rt_info("pad_token_id", "") + ov_model.set_rt_info("eos_token_id", "") + ov_model.set_rt_info("chat_template", "") + ov.save_model(ov_model, str(temp_path / src_file.name)) + + if src_file in ['openvino_tokenizer.bin', 'openvino_detokenizer.bin']: + continue + if src_file.is_file(): + shutil.copy(src_file, temp_path / src_file.name) + yield model_id, Path(temp_path) + def load_tok(configs: List[Tuple], temp_path): # load Tokenizer where all configs are cleared. diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index 68c25e5391..9260e671d6 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -158,6 +158,7 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): **tokenizer_config) tok = load_tok([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1]) + tok.set_chat_template(tokenizer_config['chat_template']) full_history_str = tok.apply_chat_template(conversation, add_generation_prompt=False) if full_history_str != full_history_str_hf: print(f'hf reference: {full_history_str_hf}') diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index d17f3c0232..d15747be63 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -509,7 +509,8 @@ def test_load_special_tokens_str_2(model_tmp_path): @pytest.mark.precommit @pytest.mark.nightly -def test_load_special_tokens_3_(model_tmp_path): +@pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons") +def test_load_special_tokens_3_(model_tokenizers_path_tmp_path): # special_tokens_map is not available # but tokenize_config.json exists # will load both string and integer representations @@ -524,7 +525,7 @@ def test_load_special_tokens_3_(model_tmp_path): "eos_token": "", } - tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tmp_path[1]) + tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1]) assert tok.get_pad_token() == tok_config_json['pad_token'] assert tok.get_bos_token() == tok_config_json['bos_token'] assert tok.get_eos_token() == tok_config_json['eos_token'] @@ -605,7 +606,8 @@ def test_load_special_tokens_4(model_tmp_path): invalid_configs = [ dict(num_beam_groups=3, num_beams=15, do_sample=True), - dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len + # TODO: CVS-158682 eos_token_id is still read from tiny-random-phi3 and we cannot modify RTInfo in tests + # dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp