From 9902928edfc2c6ce6127072bcaf6ff4b7d37be1b Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Fri, 7 Jun 2024 20:57:47 +0200
Subject: [PATCH] Generate pipeline (#334)

LLMs return logits with probabilities of each token; these probabilities can be converted to tokens/words with different techniques: greedy decoding, beam search decoding, random sampling, etc. This requires writing cumbersome post-processing code even for the simplest scenario of greedy decoding. In order to make life easier, we combined all decoding scenarios into a single function call, where the decoding method and parameters are specified by arguments.

In this PR we provide a user-friendly API for text generation inspired by the `generate` method from the HuggingFace transformers library.

- [x] enable calling tokenizers/detokenizers from LLMPipeline
- [ ] add callback for streaming mode - done partially, need to improve
- [x] rewritten samples with the current approach: [causal_lm/cpp/generate_pipeline/generate_sample.cpp#L73-L83](https://github.com/pavel-esir/openvino.genai/blob/generate_pipeline/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp#L73-L83)
- [x] Multibatch greedy decoding
- [ ] Speculative decoding
- [ ] Grouped Beam Search decoding: ready for batch 1, need to rebase multibatch support after merging https://github.com/openvinotoolkit/openvino.genai/pull/349
- [x] Random sampling

Example 1: Greedy search generation
```
LLMPipeline pipe(model_path, device);
// Will try to load config from generation_config.json,
// but if not found, default values for greedy search will be used.
GenerationConfig config = pipe.generation_config();
cout << pipe(prompt, config.max_new_tokens(20));
```

Example 2: TextStreaming mode
```
LLMPipeline pipe(model_path, device);
GenerationConfig config = pipe.generation_config();
auto text_streamer = TextStreamer{pipe};
auto text_streamer_callback = [&text_streamer](std::vector&& tokens, LLMPipeline& pipe){
    text_streamer.put(tokens[0]);
};
pipe(prompt, config.max_new_tokens(20).set_callback(text_streamer_callback));
text_streamer.end();
```

CVS-132907 CVS-137920

---------

Co-authored-by: Wovchena
Co-authored-by: Ilya Lavrenov
Co-authored-by: Alexander Suvorov
Co-authored-by: Yaroslav Tarkan
Co-authored-by: Xiake Sun
Co-authored-by: wenyi5608 <93560477+wenyi5608@users.noreply.github.com>
Co-authored-by: Ekaterina Aidova
Co-authored-by: guozhong wang
Co-authored-by: Chen Peter
---
 .github/dependabot.yml | 8 + .github/workflows/causal_lm_cpp.yml | 166 +++--- .github/workflows/genai_package.yml | 98 ++++ .github/workflows/genai_python_lib.yml | 85 +++ .github/workflows/lcm_dreamshaper_cpp.yml | 8 +- .../workflows/stable_diffusion_1_5_cpp.yml | 8 +- .gitignore | 2 +- CMakeLists.txt | 44 ++ .../lcm_dreamshaper_v7/cpp/README.md | 2 +- .../lcm_dreamshaper_v7/cpp/requirements.txt | 2 +- .../stable_diffusion_1_5/cpp/README.md | 2 +- .../stable_diffusion_1_5/cpp/requirements.txt | 2 +- llm_bench/python/benchmark.py | 110 +++- llm_bench/python/utils/model_utils.py | 1 + llm_bench/python/utils/ov_utils.py | 91 ++- llm_bench/python/utils/pt_utils.py | 2 +- pyproject.toml | 49 ++ requirements-build.txt | 2 + .../cpp/beam_search_causal_lm/CMakeLists.txt | 14 + samples/cpp/beam_search_causal_lm/README.md | 22 + .../beam_search_causal_lm.cpp | 33 ++ samples/cpp/chat_sample/CMakeLists.txt | 14 + samples/cpp/chat_sample/README.md | 22 + samples/cpp/chat_sample/chat_sample.cpp | 36 ++ samples/cpp/greedy_causal_lm/CMakeLists.txt | 14 + samples/cpp/greedy_causal_lm/README.md | 22 + 
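Note: besides the C++ examples above, this PR also adds Python bindings (`src/python/py_generate_pipeline.cpp`, the `openvino_genai` package exercised by `tests/python_tests/test_generate_api.py`). The exact Python signatures are not shown in this excerpt, so the following is only a minimal usage sketch, assuming the bindings mirror the C++ `LLMPipeline` constructor and expose a `generate()` method that accepts generation parameters (such as `max_new_tokens`) as keyword arguments:

```python
# Hypothetical sketch of the new Python API; the real bindings are defined in
# src/python/py_generate_pipeline.cpp and tested in tests/python_tests/.
from openvino_genai import LLMPipeline

# Assumes a model directory produced by
# `optimum-cli export openvino ... TinyLlama-1.1B-Chat-v1.0`.
pipe = LLMPipeline("./TinyLlama-1.1B-Chat-v1.0/", "CPU")

# Greedy decoding limited to 20 new tokens; the keyword name is assumed to
# match the C++ GenerationConfig field max_new_tokens.
print(pipe.generate("Why is the Sun yellow?", max_new_tokens=20))
```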
.../cpp/greedy_causal_lm/greedy_causal_lm.cpp | 31 + .../cpp/multinomial_causal_lm/CMakeLists.txt | 14 + samples/cpp/multinomial_causal_lm/README.md | 22 + .../multinomial_causal_lm.cpp | 38 ++ .../prompt_lookup_decoding_lm/CMakeLists.txt | 18 + .../cpp/prompt_lookup_decoding_lm/README.md | 25 + .../prompt_lookup_decoding_lm.cpp | 1 + .../cpp/requirements.txt | 2 +- .../speculative_decoding_lm/CMakeLists.txt | 18 + samples/cpp/speculative_decoding_lm/README.md | 29 + .../speculative_decoding_lm.cpp | 0 src/CMakeLists.txt | 13 + src/README.md | 246 ++++++++ .../causal_lm/cpp => src}/beam_idx-drop.gif | Bin .../causal_lm/cpp => src}/beam_idx-fork.gif | Bin src/cpp/CMakeLists.txt | 103 ++++ src/cpp/OpenVINOGenAIConfig.cmake.in | 10 + .../openvino/genai/generation_config.hpp | 125 ++++ .../include/openvino/genai/llm_pipeline.hpp | 214 +++++++ .../include/openvino/genai/streamer_base.hpp | 28 + src/cpp/include/openvino/genai/tokenizer.hpp | 109 ++++ src/cpp/include/openvino/genai/visibility.hpp | 12 + src/cpp/src/generation_config.cpp | 129 ++++ src/cpp/src/greedy_decoding.cpp | 146 +++++ .../cpp/src/group_beam_searcher.cpp | 161 ++++- src/cpp/src/llm_pipeline.cpp | 438 ++++++++++++++ src/cpp/src/multinomial_decoding.cpp | 256 ++++++++ src/cpp/src/text_callback_streamer.cpp | 45 ++ src/cpp/src/text_callback_streamer.hpp | 27 + src/cpp/src/tokenizer.cpp | 428 ++++++++++++++ src/cpp/src/utils.cpp | 160 +++++ src/cpp/src/utils.hpp | 72 +++ src/python/CMakeLists.txt | 51 ++ src/python/openvino_genai/__init__.py | 22 + src/python/openvino_genai/__version__.py | 5 + src/python/py_generate_pipeline.cpp | 264 +++++++++ .../causal_lm/cpp => src}/stateful.jpg | Bin .../causal_lm/cpp => src}/stateless.jpg | Bin tests/python_tests/conftest.py | 10 + tests/python_tests/list_test_models.py | 28 + tests/python_tests/pytest.ini | 7 + tests/python_tests/requirements.txt | 3 + tests/python_tests/test_generate_api.py | 553 ++++++++++++++++++ text_generation/causal_lm/cpp/CMakeLists.txt | 42 -- text_generation/causal_lm/cpp/README.md | 179 ------ .../causal_lm/cpp/beam_search_causal_lm.cpp | 236 -------- .../causal_lm/cpp/greedy_causal_lm.cpp | 133 ----- third-party-programs.txt | 417 +++++++++++++ thirdparty/CMakeLists.txt | 34 ++ thirdparty/openvino_tokenizers | 2 +- 76 files changed, 5054 insertions(+), 711 deletions(-) create mode 100644 .github/workflows/genai_package.yml create mode 100644 .github/workflows/genai_python_lib.yml create mode 100644 CMakeLists.txt create mode 100644 pyproject.toml create mode 100644 requirements-build.txt create mode 100644 samples/cpp/beam_search_causal_lm/CMakeLists.txt create mode 100644 samples/cpp/beam_search_causal_lm/README.md create mode 100644 samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp create mode 100644 samples/cpp/chat_sample/CMakeLists.txt create mode 100644 samples/cpp/chat_sample/README.md create mode 100644 samples/cpp/chat_sample/chat_sample.cpp create mode 100644 samples/cpp/greedy_causal_lm/CMakeLists.txt create mode 100644 samples/cpp/greedy_causal_lm/README.md create mode 100644 samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp create mode 100644 samples/cpp/multinomial_causal_lm/CMakeLists.txt create mode 100644 samples/cpp/multinomial_causal_lm/README.md create mode 100644 samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp create mode 100644 samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt create mode 100644 samples/cpp/prompt_lookup_decoding_lm/README.md rename {text_generation/causal_lm/cpp => 
samples/cpp/prompt_lookup_decoding_lm}/prompt_lookup_decoding_lm.cpp (99%) rename {text_generation/causal_lm => samples}/cpp/requirements.txt (56%) create mode 100644 samples/cpp/speculative_decoding_lm/CMakeLists.txt create mode 100644 samples/cpp/speculative_decoding_lm/README.md rename {text_generation/causal_lm/cpp => samples/cpp/speculative_decoding_lm}/speculative_decoding_lm.cpp (100%) create mode 100644 src/CMakeLists.txt create mode 100644 src/README.md rename {text_generation/causal_lm/cpp => src}/beam_idx-drop.gif (100%) rename {text_generation/causal_lm/cpp => src}/beam_idx-fork.gif (100%) create mode 100644 src/cpp/CMakeLists.txt create mode 100644 src/cpp/OpenVINOGenAIConfig.cmake.in create mode 100644 src/cpp/include/openvino/genai/generation_config.hpp create mode 100644 src/cpp/include/openvino/genai/llm_pipeline.hpp create mode 100644 src/cpp/include/openvino/genai/streamer_base.hpp create mode 100644 src/cpp/include/openvino/genai/tokenizer.hpp create mode 100644 src/cpp/include/openvino/genai/visibility.hpp create mode 100644 src/cpp/src/generation_config.cpp create mode 100644 src/cpp/src/greedy_decoding.cpp rename text_generation/causal_lm/cpp/group_beam_searcher.hpp => src/cpp/src/group_beam_searcher.cpp (67%) create mode 100644 src/cpp/src/llm_pipeline.cpp create mode 100644 src/cpp/src/multinomial_decoding.cpp create mode 100644 src/cpp/src/text_callback_streamer.cpp create mode 100644 src/cpp/src/text_callback_streamer.hpp create mode 100644 src/cpp/src/tokenizer.cpp create mode 100644 src/cpp/src/utils.cpp create mode 100644 src/cpp/src/utils.hpp create mode 100644 src/python/CMakeLists.txt create mode 100644 src/python/openvino_genai/__init__.py create mode 100644 src/python/openvino_genai/__version__.py create mode 100644 src/python/py_generate_pipeline.cpp rename {text_generation/causal_lm/cpp => src}/stateful.jpg (100%) rename {text_generation/causal_lm/cpp => src}/stateless.jpg (100%) create mode 100644 tests/python_tests/conftest.py create mode 100644 tests/python_tests/list_test_models.py create mode 100644 tests/python_tests/pytest.ini create mode 100644 tests/python_tests/requirements.txt create mode 100644 tests/python_tests/test_generate_api.py delete mode 100644 text_generation/causal_lm/cpp/CMakeLists.txt delete mode 100644 text_generation/causal_lm/cpp/README.md delete mode 100644 text_generation/causal_lm/cpp/beam_search_causal_lm.cpp delete mode 100644 text_generation/causal_lm/cpp/greedy_causal_lm.cpp create mode 100644 third-party-programs.txt create mode 100644 thirdparty/CMakeLists.txt diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 9ab4587c2a..85614b7032 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,5 +1,9 @@ version: 2 updates: + - package-ecosystem: "pip" + directory: "./" + schedule: + interval: "weekly" - package-ecosystem: "pip" directory: "image_generation/stable_diffusion_1_5/cpp/scripts/" schedule: @@ -8,6 +12,10 @@ updates: directory: "image_generation/lcm_dreamshaper_v7/cpp/scripts/" schedule: interval: "weekly" + - package-ecosystem: "pip" + directory: "./tests/python_tests/" + schedule: + interval: "weekly" - package-ecosystem: "pip" directory: "text_generation/causal_lm/cpp/" schedule: diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index df03bab7c6..57f7924cce 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -3,7 +3,8 @@ on: pull_request: paths: - .github/workflows/causal_lm_cpp.yml - - 
text_generation/causal_lm/cpp/* + - src/* + - samples/* - thirdparty/openvino_tokenizers - "!**.md" concurrency: @@ -23,21 +24,20 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] - sudo apt-get install libtbb-dev + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: greedy_causal_lm run: | source ./ov/setupvars.sh - ./build/greedy_causal_lm ./open_llama_3b_v2/ "return 0" + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" cpp-beam_search_causal_lm-ubuntu: runs-on: ubuntu-20.04 @@ -51,22 +51,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] - sudo apt-get install libtbb-dev + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt + timeout 25s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" 
> ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -74,7 +73,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -82,7 +81,7 @@ jobs: " echo "Why is the Sun yellow?" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt + timeout 25s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -90,7 +89,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('69', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -98,7 +97,7 @@ jobs: " echo "69" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt + timeout 25s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -106,7 +105,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('Hi', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -114,7 +113,7 @@ jobs: " echo "Hi" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt + timeout 25s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -122,7 +121,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = 
tokenizer('return 0', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -130,7 +129,7 @@ jobs: " echo "return 0" passed - ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt + timeout 25s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -138,7 +137,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -146,7 +145,7 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 1m ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" 
> ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -160,7 +159,7 @@ jobs: for prompt in prompts: tokenized = tokenizer(prompt, return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -168,6 +167,7 @@ jobs: " echo "Multi prompt" passed cpp-beam_search_causal_lm-windows: + if: false runs-on: windows-latest steps: - uses: actions/checkout@v4 @@ -179,29 +179,30 @@ jobs: - name: Install OpenVINO shell: bash run: | - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/windows/w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64.zip + curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/windows/w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64.zip unzip ov.zip - name: Download, convert and build shell: cmd run: | - call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat - python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare shell: cmd run: | - call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat - - .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt + call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat + set PATH=.\build\openvino_genai\;%PATH% + .\build/samples/cpp/greedy_causal_lm/Release/greedy_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt + echo import transformers > ref.py echo predictions = open('pred.txt', 'r').read() >> ref.py echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py echo for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): >> ref.py - echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' >> ref.py + echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], 
skip_special_tokens=True) >> ref.py echo idx = predictions.find(ref) >> ref.py echo if -1 == idx: >> ref.py echo raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py @@ -220,21 +221,20 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] - sudo apt-get install libtbb-dev + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./Qwen-7B-Chat/ 69 > ./pred.txt + timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen-7B-Chat/ 69 > ./pred.txt cpp-beam_search_causal_lm-Qwen1_5-7B-Chat: runs-on: ubuntu-20.04-16-cores @@ -248,21 +248,20 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] - sudo apt-get install libtbb-dev + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Run run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" > ./pred_qwen15.txt + timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" 
> ./pred_qwen15.txt cpp-beam_search_causal_lm-Phi-2: runs-on: ubuntu-20.04-16-cores @@ -276,21 +275,20 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] - sudo apt-get install libtbb-dev + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./phi-2/ 69 > ./pred.txt + timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69 > ./pred.txt cpp-beam_search_causal_lm-notus-7b-v1: runs-on: ubuntu-20.04-16-cores @@ -304,21 +302,20 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] - sudo apt-get install libtbb-dev + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./notus-7b-v1/ 69 > ./pred.txt + timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69 > ./pred.txt cpp-speculative_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores @@ -332,23 +329,22 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl 
https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] - sudo apt-get install libtbb-dev + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: run and compare run: | source ./ov/setupvars.sh - ./build/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt + ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -371,16 +367,15 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] - sudo apt-get install libtbb-dev + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: run and compare run: | @@ -393,8 +388,8 @@ jobs: Question: Can you please add 2 and 3 A:' > ./prompt.txt - ./build/prompt_lookup_decoding_lm 
./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt - ./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt + ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt + ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -416,22 +411,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] - sudo apt-get install libtbb-dev + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt - timeout 50s ./build/beam_search_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_beam.txt + timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_beam.txt - name: Compare run: | python -c " @@ -441,7 +435,7 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5') tokenized = tokenizer('Alan Turing was a', return_tensors='pt') for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False): - ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -461,22 +455,22 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source 
./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] - sudo apt-get install libtbb-dev + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - run: source ./ov/setupvars.sh && convert_tokenizer ./redpajama-3b-chat/ --output ./redpajama-3b-chat/ --with-detokenizer --trust-remote-code - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt + + timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " @@ -486,7 +480,7 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat') tokenized = tokenizer('Alan Turing was a', return_tensors='pt') for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False): - ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref}" from predictions') diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml new file mode 100644 index 0000000000..1cb6d5d142 --- /dev/null +++ b/.github/workflows/genai_package.yml @@ -0,0 +1,98 @@ +name: genai_package +on: pull_request +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} + cancel-in-progress: true +jobs: + ubuntu_genai_package: + strategy: + matrix: + build-type: [Release, Debug] + runs-on: ubuntu-20.04 + env: + CMAKE_BUILD_PARALLEL_LEVEL: null + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: mkdir ./ov/ + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov + - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ov/samples/cpp/ -B ./samples\ build/ && cmake --build ./samples\ build/ --config ${{ matrix.build-type }} -j && cmake --install ./samples\ build/ --config ${{ matrix.build-type }} 
--component samples_bin --prefix s\ pace + if: ${{ 'Release' != matrix.build-type }} + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + - run: source ./ov/setupvars.sh && timeout 50s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" + + macos_genai_package: + strategy: + matrix: + build-type: [Release, Debug] + runs-on: macos-12 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: mkdir ./ov/ + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + - run: brew install coreutils scons + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov + - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && timeout 50s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" + if: ${{ 'Release' == matrix.build-type }} + + windows_genai_package: + strategy: + matrix: + build-type: [Release, Debug] + runs-on: windows-latest + env: + CMAKE_BUILD_PARALLEL_LEVEL: null + defaults: + run: + shell: cmd + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/windows/w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64.zip + - run: unzip ov.zip + # Shorten the next setupvars calls. 
+ - run: mklink /D ov w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64 + - run: call ov\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + - run: call ov\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + - run: call ov\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov + - run: call ov\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install" + if: ${{ false && 'Release' == matrix.build-type }} # build_samples enforces Release build + - run: call ov\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + if: ${{ false && 'Release' == matrix.build-type }} + - run: call ov\setupvars.bat && python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt + if: ${{ false && 'Release' == matrix.build-type }} + - run: call ov\setupvars.bat && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + if: ${{ false && 'Release' == matrix.build-type }} + - run: call ov\setupvars.bat && "${{ github.workspace }}/samples_install/samples_bin/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" + if: ${{ false && 'Release' == matrix.build-type }} diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml new file mode 100644 index 0000000000..4d43f53e8d --- /dev/null +++ b/.github/workflows/genai_python_lib.yml @@ -0,0 +1,85 @@ +name: genai_python_lib +on: pull_request +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} + cancel-in-progress: true +jobs: + ubuntu_genai_python_lib: + # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env. + runs-on: ubuntu-22.04 + env: + # A tokenizers' dependency fails to compile with Ninja in CenOS7 env. + CMAKE_GENERATOR: Unix Makefiles + CMAKE_BUILD_PARALLEL_LEVEL: null + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: mkdir ./ov/ + # Install CentOS7 instead of Ubuntu to match PyPI distribution ABI. + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_centos7_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j + # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal + # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager + - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + - run: source ./ov/setupvars.sh && python -m pip install . 
--config-settings=build-dir="build" --verbose + - run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + + macos_genai_python_lib: + runs-on: macos-12 + env: + # A tokenizers' dependency fails to compile with Ninja. + CMAKE_GENERATOR: Unix Makefiles + CMAKE_BUILD_PARALLEL_LEVEL: null + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: mkdir ./ov/ + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + - run: brew install coreutils scons + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j + # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal + # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager + - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + - run: source ./ov/setupvars.sh && python -m pip install . --config-settings=build-dir="build" --verbose + - run: python -c "from openvino_genai import LLMPipeline" + - run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + + windows_genai_python_lib: + if: false + runs-on: windows-latest + env: + CMAKE_BUILD_PARALLEL_LEVEL: null + defaults: + run: + shell: cmd + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/windows/w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64.zip + - run: unzip ov.zip + # Shorten the next setupvars calls. + - run: mklink /D ov w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64 + - run: call ./ov/setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + - run: call ./ov/setupvars.bat && cmake --build ./build/ --config Release -j + - run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager + # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. + - run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + - run: call ./ov/setupvars.bat && python -m pip install . 
--config-settings=build-dir="build" --verbose + - run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index de06153570..17ad925b59 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -40,15 +40,15 @@ jobs: run: | conda activate openvino_lcm_cpp conda update -c conda-forge --all - conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install -r requirements.txt python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + python -m pip install -r requirements.txt - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} @@ -85,15 +85,15 @@ jobs: run: | conda activate openvino_lcm_cpp conda update -c conda-forge --all - conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install -r requirements.txt python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + python -m pip install -r requirements.txt - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 38a2022e1d..db28fad79f 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -39,15 +39,15 @@ jobs: - name: Install OpenVINO and other conda dependencies run: | conda activate openvino_sd_cpp - conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - python -m pip install -r requirements.txt python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + python -m pip install -r requirements.txt - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} @@ -83,14 +83,14 @@ jobs: - name: Install OpenVINO and other conda dependencies run: | conda activate openvino_sd_cpp - conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - python -m pip install -r requirements.txt python -m pip install 
../../../thirdparty/openvino_tokenizers/[transformers] + python -m pip install -r requirements.txt - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/.gitignore b/.gitignore index ae479f4faa..2e39ce5394 100644 --- a/.gitignore +++ b/.gitignore @@ -38,4 +38,4 @@ CMakeUserPresets.json # Python-specific *.?env* *.pyc -__pycache__ \ No newline at end of file +__pycache__ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..b8cad76f52 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,44 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +cmake_minimum_required(VERSION 3.23.0) # The requirement comes from Jinja2Cpp + +# Multi config generators such as Visual Studio ignore CMAKE_BUILD_TYPE. Multi config generators are configured with +# CMAKE_CONFIGURATION_TYPES, but limiting options in it completely removes such build options +get_property(GENERATOR_IS_MULTI_CONFIG_VAR GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) +if(CMAKE_GENERATOR STREQUAL "Ninja Multi-Config") + # 'Ninja Multi-Config' specific, see: + # https://cmake.org/cmake/help/latest/variable/CMAKE_DEFAULT_BUILD_TYPE.html + set(CMAKE_DEFAULT_BUILD_TYPE "Release" CACHE STRING "CMake default build type") +elseif(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE) + message(STATUS "CMAKE_BUILD_TYPE is not defined, 'Release' will be used") + # Setting CMAKE_BUILD_TYPE as CACHE must go before project(). Otherwise project() sets its value and set() doesn't take an effect + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel ...") +endif() + +project(OpenVINOGenAI VERSION 2024.2.0.0) + +add_subdirectory(./thirdparty/) +add_subdirectory(src) +add_subdirectory(samples/cpp/beam_search_causal_lm/) +add_subdirectory(samples/cpp/chat_sample/) +add_subdirectory(samples/cpp/greedy_causal_lm/) +add_subdirectory(samples/cpp/multinomial_causal_lm/) +add_subdirectory(samples/cpp/prompt_lookup_decoding_lm/) +add_subdirectory(samples/cpp/speculative_decoding_lm/) + +install(DIRECTORY + ./samples/cpp/beam_search_causal_lm + ./samples/cpp/chat_sample + ./samples/cpp/greedy_causal_lm + ./samples/cpp/multinomial_causal_lm + # Don't install prompt_lookup_decoding_lm and speculative_decoding_lm because they don't use openvino_genai library and arent verifyed yet. 
+ DESTINATION samples/cpp/ COMPONENT cpp_samples_genai) +install(FILES ./samples/cpp/requirements.txt DESTINATION samples/cpp/ COMPONENT cpp_samples_genai) +install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) +install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) +if(MSVC AND NOT DEFINED CPACK_GENERATOR) + set(CPACK_GENERATOR "ZIP") +endif() +include(CPack) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/README.md b/image_generation/lcm_dreamshaper_v7/cpp/README.md index 6077b8a1c7..a7b19ae4e4 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/README.md +++ b/image_generation/lcm_dreamshaper_v7/cpp/README.md @@ -18,7 +18,7 @@ Prepare a python environment and install dependencies: conda create -n openvino_lcm_cpp python==3.10 conda activate openvino_lcm_cpp conda update -c conda-forge --all -conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake +conda install -c conda-forge openvino=2024.2.0 c-compiler cxx-compiler git make cmake # Ensure that Conda standard libraries are used conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH ``` diff --git a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt index 047e0d826f..e86e1c2eb1 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt +++ b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://download.pytorch.org/whl/cpu torch==2.2.2+cpu diffusers==0.27.2 -optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 +optimum-intel[openvino]==1.17.0 diff --git a/image_generation/stable_diffusion_1_5/cpp/README.md b/image_generation/stable_diffusion_1_5/cpp/README.md index fb01326ea5..81ccd0c296 100644 --- a/image_generation/stable_diffusion_1_5/cpp/README.md +++ b/image_generation/stable_diffusion_1_5/cpp/README.md @@ -18,7 +18,7 @@ Prepare a python environment and install dependencies: ```shell conda create -n openvino_sd_cpp python==3.10 conda activate openvino_sd_cpp -conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake +conda install -c conda-forge openvino=2024.2.0 c-compiler cxx-compiler git make cmake # Ensure that Conda standard libraries are used conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH ``` diff --git a/image_generation/stable_diffusion_1_5/cpp/requirements.txt b/image_generation/stable_diffusion_1_5/cpp/requirements.txt index 29b40d70c4..dd5faeb7de 100644 --- a/image_generation/stable_diffusion_1_5/cpp/requirements.txt +++ b/image_generation/stable_diffusion_1_5/cpp/requirements.txt @@ -2,5 +2,5 @@ torch==2.2.2+cpu diffusers==0.27.2 transformers==4.39.3 -optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 +optimum-intel[openvino]==1.17.0 huggingface_hub[cli]==0.22.2 diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 8441a36e9b..c3a54f4a83 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -188,12 +188,109 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, bench_hook.clear_time_infer_list() +def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, prompt_index, streamer, model_precision, proc_id): + 
set_seed(args['seed']) + input_text_list = [input_text] * args['batch_size'] + if args["output_dir"] is not None and num == 0: + for bs_index, in_text in enumerate(input_text_list): + utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) + tok_encode_start = time.perf_counter() + input_data = tokenizer(input_text_list, return_tensors='pt') + tok_encode_end = time.perf_counter() + tok_encode_time = (tok_encode_end - tok_encode_start) * 1000 + # Remove `token_type_ids` from inputs + input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data + input_token_size = input_tokens[0].numel() + if args['batch_size'] > 1: + out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) + out_str += " Batch_size={}, ".format(args['batch_size']) + out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) + if args['infer_count'] is not None: + out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) + log.info(out_str) + + max_rss_mem_consumption = '' + max_shared_mem_consumption = '' + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + mem_consumption.start_collect_memory_consumption() + max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] + streamer.reset() + start = time.perf_counter() + generated_text = model.generate(input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer) + log.info(generated_text) + end = time.perf_counter() + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + mem_consumption.end_collect_momory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption = mem_consumption.get_max_memory_consumption() + mem_consumption.clear_max_memory_consumption() + + generation_time = end - start + + result = [streamer.get_tokens()] + tok_decode_start = time.perf_counter() + _ = tokenizer.batch_decode(result) + tok_decode_end = time.perf_counter() + tok_decode_time = (tok_decode_end - tok_decode_start) * 1000 + # Only text_gen need to minus length of input_data, because generated_text may include input_text + num_tokens = 0 + result_md5_list = [] + for bs_idx in range(args['batch_size']): + generated_text_len = len(result[bs_idx]) + num_tokens += generated_text_len + if generated_text_len > max_gen_tokens: + log.error('Output token size is over max output token size!') + result_text = generated_text[bs_idx] + if args["output_dir"] is not None: + utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) + result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) + if num == 0: + warmup_md5[prompt_index] = result_md5_list + per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) + tm_list = streamer.get_time_list() + log.debug('latency of all tokens:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + iter_data = gen_iterate_data( + num, + input_token_size * args['batch_size'], + len(tm_list), + num_tokens, + generation_time, + per_token_time, + result_md5_list, + max_rss_mem=max_rss_mem_consumption, + max_shared_mem=max_shared_mem_consumption, + prompt_idx=prompt_index, + tokenization_time=(tok_encode_time, tok_decode_time) + ) + iter_data_list.append(iter_data) + utils.metrics_print.print_metrics( + num, + iter_data, + tm_list, + [], + warm_up=(num == 0), 
+ max_rss_mem=max_rss_mem_consumption, + max_shared_mem=max_shared_mem_consumption, + tokenization_time=(tok_encode_time, tok_decode_time), + batch_size=args['batch_size'] + ) + if num > 0: + warmup_md5_list = warmup_md5[prompt_index] + if result_md5_list != warmup_md5_list: + log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} is different from warm-up's md5 {warmup_md5_list}") + utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + else: + utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + streamer.reset() + + def run_text_generation_benchmark(model_path, framework, device, args, num_iters): - model, tokenizer, pretrain_time, bench_hook = FW_UTILS[framework].create_text_gen_model(model_path, device, **args) + model, tokenizer, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_text_gen_model(model_path, device, **args) model_precision = utils.model_utils.get_model_precision(model_path.parts) iter_data_list = [] warmup_md5 = {} input_text_list = utils.model_utils.get_prompts(args) + text_gen_fn = run_text_generation if not use_genai else run_text_generation_genai if len(input_text_list) == 0: raise RuntimeError('==Failure prompts is empty ==') log.info(f"Numbeams: {args['num_beams']}, benchmarking iter nums(exclude warm-up): {num_iters}, " @@ -207,13 +304,13 @@ def run_text_generation_benchmark(model_path, framework, device, args, num_iters for prompt_idx, input_text in enumerate(input_text_list): if num == 0: log.info(f'[warm-up] Input text: {input_text}') - run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, prompt_idx, bench_hook, model_precision, proc_id) + text_gen_fn(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, prompt_idx, bench_hook, model_precision, proc_id) else: for prompt_idx, input_text in enumerate(input_text_list): for num in range(num_iters + 1): if num == 0: log.info(f'[warm-up] Input text: {input_text}') - run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, prompt_idx, bench_hook, model_precision, proc_id) + text_gen_fn(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, prompt_idx, bench_hook, model_precision, proc_id) utils.metrics_print.print_average(iter_data_list, prompt_idx_list, args['batch_size'], True) return iter_data_list, pretrain_time @@ -281,6 +378,8 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, def run_image_generation_benchmark(model_path, framework, device, args, num_iters): + if args['genai']: + log.warning("GenAI pipeline is not supported for this task. Switched on default benchmarking") pipe, pretrain_time = FW_UTILS[framework].create_image_gen_model(model_path, device, **args) iter_data_list = [] input_image_list = utils.model_utils.get_image_param_from_prompt_file(args) @@ -311,6 +410,8 @@ def run_image_generation_benchmark(model_path, framework, device, args, num_iter def run_image_classification(model_path, framework, device, args, num_iters=10): + if args['genai']: + log.warning("GenAI pipeline is not supported for this task. 
Switched on default benchmarking") model, input_size = FW_UTILS[framework].create_image_classification_model(model_path, device, **args) data = torch.rand(input_size) @@ -381,6 +482,8 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_iters): + if args["genai"]: + log.warning("GenAI pipeline is not supported for this task. Switched on default benchmarking") pipe, pretrain_time = FW_UTILS[framework].create_ldm_super_resolution_model(model_path, device, **args) iter_data_list = [] tm_list = [] @@ -511,6 +614,7 @@ def get_argprser(): ) parser.add_argument('-od', '--output_dir', help='Save the input text and generated text, images to files') utils.model_utils.add_stateful_model_arguments(parser) + parser.add_argument("--genai", action="store_true") return parser.parse_args() diff --git a/llm_bench/python/utils/model_utils.py b/llm_bench/python/utils/model_utils.py index 55e58dfea8..cdac5cc7b2 100644 --- a/llm_bench/python/utils/model_utils.py +++ b/llm_bench/python/utils/model_utils.py @@ -135,6 +135,7 @@ def analyze_args(args): model_args['convert_tokenizer'] = args.convert_tokenizer model_args['subsequent'] = args.subsequent model_args['output_dir'] = args.output_dir + model_args['genai'] = args.genai model_framework = args.framework model_path = Path(args.model) diff --git a/llm_bench/python/utils/ov_utils.py b/llm_bench/python/utils/ov_utils.py index 3d77941ca9..e76f2486f6 100644 --- a/llm_bench/python/utils/ov_utils.py +++ b/llm_bench/python/utils/ov_utils.py @@ -90,8 +90,12 @@ def build_ov_tokenizer(hf_tokenizer): return hf_tokenizer ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) - ov_compiled_tokenizer = ov.compile_model(ov_tokenizer) - ov_compiled_detokenizer = ov.compile_model(ov_detokenizer) + return build_ov_tokenizer_wrapper(hf_tokenizer, ov_tokenizer, ov_detokenizer) + + +def build_ov_tokenizer_wrapper(hf_tokenizer, tokenizer_model, detokenizer_model): + ov_compiled_tokenizer = ov.compile_model(tokenizer_model) + ov_compiled_detokenizer = ov.compile_model(detokenizer_model) def encode_ov_tokenizer_full(self, text, *args, **kwargs): if isinstance(text, str): @@ -139,6 +143,13 @@ def create_text_gen_model(model_path, device, **kwargs): if not model_path_existed: raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') else: + if kwargs.get("genai", False) and is_genai_available(log_msg=True): + if kwargs["batch_size"] > 1 or kwargs["num_beams"] > 1: + log.warning("OpenVINO GenAI based benchmarking implmented only for batch_size == 1 and num_beams == 1") + elif model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"]]: + log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. 
Will be switched to default bencmarking") + else: + return create_genai_text_gen_model(model_path, device, ov_config, **kwargs) remote_code = False try: model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=False) @@ -164,7 +175,69 @@ def create_text_gen_model(model_path, device, **kwargs): tokenizer = token_class.from_pretrained(model_path, trust_remote_code=True) if kwargs.get("convert_tokenizer", False): tokenizer = build_ov_tokenizer(tokenizer) - return ov_model, tokenizer, from_pretrained_time, bench_hook + return ov_model, tokenizer, from_pretrained_time, bench_hook, False + + +def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): + import openvino_tokenizers # noqa: F401 + import openvino_genai + from transformers import AutoTokenizer + + class TokenStreamer(openvino_genai.StreamerBase): + def __init__(self, tokenizer): + super().__init__() + self.tokenizer = tokenizer + self.token_generation_time = [] + self.generated_tokens = [] + self.start_time = time.perf_counter() + + def put(self, token_id): + self.token_generation_time.append(time.perf_counter() - self.start_time) + self.generated_tokens.append(token_id) + self.start_time = time.perf_counter() + return False + + def reset(self): + self.token_generation_time = [] + self.generated_tokens = [] + self.start_time = time.perf_counter() + + def end(self): + pass + + def get_tokens(self): + return self.generated_tokens + + def get_time_list(self): + return self.token_generation_time + + if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists(): + convert_ov_tokenizer(model_path) + + core = Core() + hf_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + ov_tok = core.read_model(model_path / "openvino_tokenizer.xml") + ov_detok = core.read_model(model_path / "openvino_detokenizer.xml") + hf_tokenizer = build_ov_tokenizer_wrapper(hf_tokenizer, ov_tok, ov_detok) + + start = time.perf_counter() + + # TO DO: add plugin config + llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper()) + end = time.perf_counter() + log.info(f'Pipeline initialization time: {end - start:.2f}s') + streamer = TokenStreamer(llm_pipe.get_tokenizer()) + + return llm_pipe, hf_tokenizer, end - start, streamer, True + + +def convert_ov_tokenizer(tokenizer_path): + from optimum.exporters.openvino.convert import export_tokenizer + from transformers import AutoTokenizer + + hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True) + + export_tokenizer(hf_tokenizer, tokenizer_path) def create_image_gen_model(model_path, device, **kwargs): @@ -198,3 +271,15 @@ def create_ldm_super_resolution_model(model_path, device, **kwargs): from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') return ov_model, from_pretrained_time + + +def is_genai_available(log_msg=False): + import importlib + try: + importlib.import_module('openvino_genai') + except ImportError as ex: + if log_msg: + log.warning("Attempt to load OpenVINO GenaAI package failed. Please install openvino_genai package. 
Full error message available in debug mode") + log.debug(ex) + return False + return True diff --git a/llm_bench/python/utils/pt_utils.py b/llm_bench/python/utils/pt_utils.py index ccf401330c..873468c1ea 100644 --- a/llm_bench/python/utils/pt_utils.py +++ b/llm_bench/python/utils/pt_utils.py @@ -97,7 +97,7 @@ def create_text_gen_model(model_path, device, **kwargs): backend = kwargs['torch_compile_backend'] compiled_model = run_torch_compile(model, backend) model = compiled_model - return model, tokenizer, from_pretrain_time, bench_hook + return model, tokenizer, from_pretrain_time, bench_hook, False def create_image_gen_model(model_path, device, **kwargs): diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..7656a64778 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,49 @@ +[project] +name = "openvino_genai" +version = "2024.2.0.0" +description = "Python bindings for https://github.com/openvinotoolkit/openvino.genai" +requires-python = ">=3.8" +readme = {file = "src/README.md", content-type="text/markdown"} +license = {text = "OSI Approved :: Apache Software License"} +authors = [ + { name = "OpenVINO Developers", email = "openvino@intel.com" }, +] +classifiers = [ + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "openvino_tokenizers~=2024.2.0.0" +] + +[tool.scikit-build] +cmake.build-type = "Release" +cmake.source-dir = "./" +cmake.targets = ["py_generate_pipeline"] # Adding genai would trigger a Release build and Debug build after it. py_generate_pipeline depends on genai and genai will be built anyway. It's not been investigated why both build types are triggered. +cmake.version = ">=3.23" +install.components = ["wheel_genai"] +sdist.cmake = true +wheel.packages = ["src/python/openvino_genai"] +wheel.install-dir = "openvino_genai" +wheel.build-tag = "000" +wheel.license-files = ["LICENSE", "SECURITY.md", "third-party-programs.txt"] + +[[tool.scikit-build.generate]] +path = "openvino_genai/__version__.py" +template = ''' +__version__ = "${version}" +''' + +[build-system] +# TODO: add build.tool-args = ["--parallel"] after scikit-build-core is updated to 0.9.4+. +requires = ["scikit-build-core~=0.8.0"] # See https://github.com/openvinotoolkit/openvino_tokenizers/pull/123 +build-backend = "scikit_build_core.build" + +[tool.pytest.ini_options] +markers = [ + "nightly", + "precommit: (deselect with '-m \"precommit\"')", +] diff --git a/requirements-build.txt b/requirements-build.txt new file mode 100644 index 0000000000..81be222a8b --- /dev/null +++ b/requirements-build.txt @@ -0,0 +1,2 @@ +build~=1.2.1 +cmake~=3.23 diff --git a/samples/cpp/beam_search_causal_lm/CMakeLists.txt b/samples/cpp/beam_search_causal_lm/CMakeLists.txt new file mode 100644 index 0000000000..9728eee3b3 --- /dev/null +++ b/samples/cpp/beam_search_causal_lm/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+) +add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) +target_link_libraries(beam_search_causal_lm PRIVATE openvino::genai) +target_compile_features(beam_search_causal_lm PRIVATE cxx_std_17) +install(TARGETS beam_search_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md new file mode 100644 index 0000000000..0f73f1f931 --- /dev/null +++ b/samples/cpp/beam_search_causal_lm/README.md @@ -0,0 +1,22 @@ +# Text generation C++ sample that supports most popular models like LLaMA 2 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample features `ov::genai::LLMPipeline` and configures it to use multiple beam groups. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +```sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +`beam_search_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` + +To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp new file mode 100644 index 0000000000..f4159399aa --- /dev/null +++ b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include + +int main(int argc, char* argv[]) try { + if (argc < 3) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " '' ['' ...]"); + } + auto prompts = std::vector(argv + 2, argv + argc); + + std::string model_path = argv[1]; + std::string device = "CPU"; // GPU can be used as well + + ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 20; + config.num_beam_groups = 3; + config.num_beams = 15; + config.num_return_sequences = config.num_beams * prompts.size(); + + auto beams = pipe.generate(prompts, config); + for (int i = 0; i < beams.scores.size(); i++) + std::cout << beams.scores[i] << ": " << beams.texts[i] << '\n'; + + return 0; +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) { + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/samples/cpp/chat_sample/CMakeLists.txt b/samples/cpp/chat_sample/CMakeLists.txt new file mode 100644 index 0000000000..347ff43823 --- /dev/null +++ b/samples/cpp/chat_sample/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +) +add_executable(chat_sample chat_sample.cpp) +target_link_libraries(chat_sample PRIVATE openvino::genai) +target_compile_features(chat_sample PRIVATE cxx_std_17) +install(TARGETS chat_sample + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md new file mode 100644 index 0000000000..d560239b92 --- /dev/null +++ b/samples/cpp/chat_sample/README.md @@ -0,0 +1,22 @@ +# C++ chat_sample that supports most popular models like LLaMA 2 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +```sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run: + +`chat_sample TinyLlama-1.1B-Chat-v1.0` + +To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. 
+ +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp new file mode 100644 index 0000000000..75cf609afe --- /dev/null +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -0,0 +1,36 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" + +int main(int argc, char* argv[]) try { + std::string prompt; + std::string accumulated_str = ""; + + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + ov::genai::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 10000; + std::function streamer = [](std::string word) { std::cout << word << std::flush; return false; }; + + pipe.start_chat(); + for (;;) { + std::cout << "question:\n"; + + std::getline(std::cin, prompt); + if (prompt == "Stop!") + break; + + pipe.generate(prompt, config, streamer); + + std::cout << "\n----------\n"; + } + pipe.finish_chat(); +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) { + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/samples/cpp/greedy_causal_lm/CMakeLists.txt b/samples/cpp/greedy_causal_lm/CMakeLists.txt new file mode 100644 index 0000000000..c492036088 --- /dev/null +++ b/samples/cpp/greedy_causal_lm/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. +) +add_executable(greedy_causal_lm greedy_causal_lm.cpp) +target_link_libraries(greedy_causal_lm PRIVATE openvino::genai) +target_compile_features(greedy_causal_lm PRIVATE cxx_std_17) +install(TARGETS greedy_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/greedy_causal_lm/README.md new file mode 100644 index 0000000000..fa23d35b0a --- /dev/null +++ b/samples/cpp/greedy_causal_lm/README.md @@ -0,0 +1,22 @@ +# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 2 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application don't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a differnt one, GPU for example, from the command line interface. The sample fearures `ov::genai::LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of LLM-powered Chatbot in Python. 
+ +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +```sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +`greedy_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` + +To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. diff --git a/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp b/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp new file mode 100644 index 0000000000..dd309af8f9 --- /dev/null +++ b/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp @@ -0,0 +1,31 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" + +int main(int argc, char* argv[]) try { + if (3 > argc || argc > 4) + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\" "); + + std::string model_path = argv[1]; + std::string prompt = argv[2]; + + // GPU can be used as well + std::string device = "CPU"; + if (argc > 3) device = argv[3]; + + ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 100; + config.do_sample = false; + auto streamer = [](std::string subword){ std::cout << subword << std::flush; return false; }; + + // since streamer is set results will be printed each time a new token is generated + pipe.generate(prompt, config, streamer); +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) { + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/samples/cpp/multinomial_causal_lm/CMakeLists.txt b/samples/cpp/multinomial_causal_lm/CMakeLists.txt new file mode 100644 index 0000000000..1d79af25d2 --- /dev/null +++ b/samples/cpp/multinomial_causal_lm/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+) +add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) +target_link_libraries(multinomial_causal_lm PRIVATE openvino::genai) +target_compile_features(multinomial_causal_lm PRIVATE cxx_std_17) +install(TARGETS multinomial_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md new file mode 100644 index 0000000000..e43320f6a1 --- /dev/null +++ b/samples/cpp/multinomial_causal_lm/README.md @@ -0,0 +1,22 @@ +# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 2 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample features `ov::genai::LLMPipeline` and configures it to run the random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +```sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +`multinomial_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` + +To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
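To make the sampling parameters used by this sample more concrete, below is a small illustrative sketch of top-k / top-p (nucleus) filtering followed by multinomial sampling over a plain logits vector. It is a simplified standalone function written for explanation only; it is not the implementation used inside `ov::genai`, and the function name and signature are invented for the sketch.

```cpp
// Illustrative top-k / top-p filtering + multinomial sampling over raw logits.
// A sketch for understanding the sample's config values, not library code.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

int64_t sample_token(const std::vector<float>& logits, size_t top_k, float top_p, std::mt19937& rng) {
    // Softmax over the logits.
    std::vector<float> probs(logits.size());
    float max_logit = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp(logits[i] - max_logit);
        sum += probs[i];
    }
    for (float& p : probs) p /= sum;

    // Token ids sorted by probability, highest first.
    std::vector<int64_t> ids(logits.size());
    std::iota(ids.begin(), ids.end(), 0);
    std::sort(ids.begin(), ids.end(), [&](int64_t a, int64_t b) { return probs[a] > probs[b]; });

    // Keep at most top_k tokens and stop once the cumulative probability reaches top_p.
    std::vector<int64_t> kept;
    std::vector<float> kept_probs;
    float cumulative = 0.0f;
    for (int64_t id : ids) {
        if (kept.size() >= top_k || cumulative >= top_p) break;
        kept.push_back(id);
        kept_probs.push_back(probs[id]);
        cumulative += probs[id];
    }

    // Draw one of the kept tokens; discrete_distribution renormalizes the weights.
    std::discrete_distribution<size_t> dist(kept_probs.begin(), kept_probs.end());
    return kept[dist(rng)];
}
```

With the sample's `top_p = 0.9` and `top_k = 30`, at most 30 of the most likely tokens are considered, and the tail of the distribution beyond 90% cumulative probability is cut off before the random draw.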
diff --git a/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp b/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp new file mode 100644 index 0000000000..6cbab7d2f5 --- /dev/null +++ b/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" + +int main(int argc, char* argv[]) try { + if (3 > argc || argc > 4) + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\" "); + + std::string model_path = argv[1]; + std::string prompt = argv[2]; + + // GPU can be used as well + std::string device = "CPU"; + if (argc > 3) { + device = argv[3]; + } + + ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 100; + config.do_sample = true; + config.top_p = 0.9; + config.top_k = 30; + auto streamer = [](std::string subword) { + std::cout << subword << std::flush; + return false; + }; + + // since streamer is set results will be printed each time a new token is generated + pipe.generate(prompt, config, streamer); +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) { + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt new file mode 100644 index 0000000000..1fff62a1aa --- /dev/null +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -0,0 +1,18 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +if(TARGET openvino_tokenizers) + set(OPENVINO_TOKENIZERS_PATH $) +else() + message(FATAL_ERROR "prompt_lookup_decoding_lm must be compiled as part of OpenVINOGenAI to have the path to openvino_tokenizers hardcoded.") +endif() +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +find_package(TBB REQUIRED COMPONENTS tbb) +add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) +target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime TBB::tbb) +target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") +target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17) +install(TARGETS prompt_lookup_decoding_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md new file mode 100644 index 0000000000..3d01a64420 --- /dev/null +++ b/samples/cpp/prompt_lookup_decoding_lm/README.md @@ -0,0 +1,25 @@ +# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 2 + +[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching over the prompt to generate candidate token sequences. This method is highly effective for input-grounded generation (summarization, document QA, multi-turn chat, code editing), where there is a high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output.
Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +```sh +source /setupvars.sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +`prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` + +To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
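As a rough illustration of the string-matching idea described in this README (and not the sample's actual implementation), the sketch below looks for the most recent earlier occurrence of the last few generated tokens and proposes the tokens that followed that occurrence as draft candidates; `ngram_size` and `num_candidates` are parameters invented for the sketch.

```cpp
// Illustrative prompt-lookup candidate generation: find the latest earlier match of the
// trailing n-gram of `tokens` and propose the tokens that followed it as draft candidates.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> propose_candidates(const std::vector<int64_t>& tokens,
                                         size_t ngram_size,
                                         size_t num_candidates) {
    if (tokens.size() <= ngram_size)
        return {};
    // The n-gram to search for: the last `ngram_size` tokens of the sequence.
    const auto ngram_begin = tokens.end() - ngram_size;
    // Scan from the most recent position backwards so fresher matches win.
    for (size_t start = tokens.size() - ngram_size; start-- > 0;) {
        if (std::equal(ngram_begin, tokens.end(), tokens.begin() + start)) {
            size_t follow = start + ngram_size;  // first position after the matched n-gram
            size_t count = std::min(num_candidates, tokens.size() - follow);
            return std::vector<int64_t>(tokens.begin() + follow, tokens.begin() + follow + count);
        }
    }
    return {};  // no match found: fall back to ordinary autoregressive decoding
}
```

The proposed candidates are then validated by the main model in a single inference request, exactly as in speculative decoding, so mismatching candidates cost nothing beyond that one request.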
diff --git a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp similarity index 99% rename from text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp rename to samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 5060b88642..cd6de37753 100644 --- a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -1,6 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include #include #include diff --git a/text_generation/causal_lm/cpp/requirements.txt b/samples/cpp/requirements.txt similarity index 56% rename from text_generation/causal_lm/cpp/requirements.txt rename to samples/cpp/requirements.txt index 9ea792f7b0..ae5e9b7e9c 100644 --- a/text_generation/causal_lm/cpp/requirements.txt +++ b/samples/cpp/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum[openvino]==1.20.0 -optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt new file mode 100644 index 0000000000..e18ffec97b --- /dev/null +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -0,0 +1,18 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +if(TARGET openvino_tokenizers) + set(OPENVINO_TOKENIZERS_PATH $) +else() + message(FATAL_ERROR "speculative_decoding_lm must be compiled as part of OpenVINOGenAI to have the path to openvino_tokenizers hardcoded.") +endif() +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +find_package(TBB REQUIRED COMPONENTS tbb) +add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) +target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime TBB::tbb) +target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") +target_compile_features(speculative_decoding_lm PRIVATE cxx_std_17) +install(TARGETS speculative_decoding_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md new file mode 100644 index 0000000000..c0fd9bc4c5 --- /dev/null +++ b/samples/cpp/speculative_decoding_lm/README.md @@ -0,0 +1,29 @@ +# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 2 + +Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that allows speeding up token generation when an additional smaller draft model is used alongside the main model. + +Speculative decoding works in the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model.
Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. + +This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case, they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original papers https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +```sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +optimum-cli export openvino --trust-remote-code --model meta-llama/Llama-2-7b-chat-hf Llama-2-7b-chat-hf +``` + +## Run + +`speculative_decoding_lm TinyLlama-1.1B-Chat-v1.0 Llama-2-7b-chat-hf "Why is the Sun yellow?"` + +To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
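A compact sketch of the validate-and-accept loop described above (greedy variant, heavily simplified): the `GreedyModel` callable and the helper names are placeholders invented for the sketch, not APIs of this repository.

```cpp
// Simplified greedy speculative decoding: the draft model proposes K tokens, the main
// model scores them in one pass, and the longest agreeing prefix plus one corrected
// token is accepted. Placeholder types for illustration only.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

// A "model" here is any callable returning greedy next-token predictions for every
// position from the end of `context` through the end of `candidates`
// (candidates.size() + 1 values).
using GreedyModel = std::function<std::vector<int64_t>(const std::vector<int64_t>& context,
                                                       const std::vector<int64_t>& candidates)>;

std::vector<int64_t> speculative_generate(const GreedyModel& main_model,
                                          const GreedyModel& draft_model,
                                          std::vector<int64_t> tokens,
                                          size_t K, size_t max_new_tokens) {
    size_t generated = 0;
    while (generated < max_new_tokens) {
        // 1. The draft model proposes K tokens autoregressively.
        std::vector<int64_t> draft;
        for (size_t i = 0; i < K; ++i)
            draft.push_back(draft_model(tokens, draft).back());
        // 2. The main model validates all proposals in a single pass.
        std::vector<int64_t> verified = main_model(tokens, draft);
        // 3. Accept the longest prefix where draft and main model agree, then always
        //    take one extra token from the main model (its correction or bonus token).
        size_t accepted = 0;
        while (accepted < K && draft[accepted] == verified[accepted])
            ++accepted;
        for (size_t i = 0; i < accepted; ++i)
            tokens.push_back(draft[i]);
        tokens.push_back(verified[accepted]);
        generated += accepted + 1;  // may slightly overshoot max_new_tokens; fine for a sketch
    }
    return tokens;
}
```

In the best case every drafted token is accepted and each main-model request yields K + 1 new tokens; in the worst case it still yields one, matching ordinary decoding.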
diff --git a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp similarity index 100% rename from text_generation/causal_lm/cpp/speculative_decoding_lm.cpp rename to samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000..d154836878 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,13 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +# Find OpenVINODeveloperPackage first to compile with SDL flags +find_package(OpenVINODeveloperPackage QUIET + PATHS "${OpenVINO_DIR}") +if(NOT OpenVINODeveloperPackage_FOUND) + find_package(OpenVINO REQUIRED COMPONENTS Runtime) +endif() + +add_subdirectory(cpp) +add_subdirectory(python) diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000000..db48793427 --- /dev/null +++ b/src/README.md @@ -0,0 +1,246 @@ +# OpenVINO Generate API + +## Usage + +First of all, you need to convert your model with optimum-cli: +```sh +optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" +pip install openvino-genai +``` + +`LLMPipeline` is the main object used for decoding. You can construct it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration. + +### Python + +A minimalist example: +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") +print(pipe.generate("The Sun is yellow because")) +``` + +Calling generate with custom generation config parameters, e.g. a config for grouped beam search: +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") + +result = pipe.generate("The Sun is yellow because", max_new_tokens=30, num_groups=3, group_size=5, diversity_penalty=1.5) +print(result) +``` + +output: +``` +'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color.
The arrangement of carbon atoms in' +``` + +A simple chat in Python: +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path) + +config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.5} +pipe.set_generation_config(config) + +pipe.start_chat() +while True: +    print('question:') +    prompt = input() +    if prompt == 'Stop!': +        break +    print(pipe(prompt)) +pipe.finish_chat() +``` + +Test to compare with Huggingface outputs + +### C++ + +A minimalist example: +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow because"); +} +``` + +Using Group Beam Search Decoding +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + ov::genai::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 256; + config.num_groups = 3; + config.group_size = 5; + config.diversity_penalty = 1.0f; + + std::cout << pipe.generate("The Sun is yellow because", config); +} +``` + +A simple chat in C++ using grouped beam search decoding +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string prompt; + + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + ov::genai::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 256; + config.num_groups = 3; + config.group_size = 5; + config.diversity_penalty = 1.0f; + + pipe.start_chat(); + for (;;) { + std::cout << "question:\n"; + std::getline(std::cin, prompt); + if (prompt == "Stop!") + break; + + std::cout << "answer:\n"; + auto answer = pipe(prompt, config); + std::cout << answer << std::endl; + } + pipe.finish_chat(); +} +``` + +Streaming example with lambda function +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + auto streamer = [](std::string word) { std::cout << word << std::flush; }; + std::cout << pipe.generate("The Sun is yellow because", streamer); +} +``` + +Streaming with a custom class +```cpp +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +class CustomStreamer: public ov::genai::StreamerBase { +public: + bool put(int64_t token) { + bool stop_flag = false; + /* custom decoding/tokens processing code + tokens_cache.push_back(token); + std::string text = m_tokenizer.decode(tokens_cache); + ... + */ + return stop_flag; + }; + + void end() { + /* custom finalization */ + }; +}; + +int main(int argc, char* argv[]) { + CustomStreamer custom_streamer; + + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow because", custom_streamer); +} +``` + +## How it works + +### Stateful LLM + +A common LLM inference optimisation is the introduction of a past KV (key/value) cache. This cache is represented by the corresponding inputs and outputs in a model originally implemented in a DL framework (e.g. PyTorch models from HuggingFace). To optimize it further and simplify usage, the model is transformed to a stateful form.
This transformation improves inference performance and decreases the amount of runtime memory allocated in long-running text generation scenarios. It is achieved by hiding the inputs and outputs of the model that represent past KV-cache tensors and handling them inside the model in a more efficient way, although the cache remains accessible through the state API. This is in contrast to the stateless approach, which requires manipulating these inputs and outputs explicitly. An introduction to stateful models can be found in https://docs.openvino.ai/2023.3/openvino_docs_OV_UG_stateful_models_intro.html. + +Hiding the KV-cache introduces a peculiarity for the beam search algorithm. Beam search suggests batched inference of multiple beams. The design described here so far would result in generating multiple independent sequences of tokens. The beam search algorithm, on the other hand, requires removing some of the ongoing beams and splitting other beams into multiple branches. Beam removal requires deleting the corresponding KV-cache entry, and beam splitting requires copying the corresponding KV-cache values. + +To make it possible to implement beam search without accessing the model's internal state, a stateful LLM converted with `optimum-intel` or [llm_bench](../../../llm_bench/python/) introduces an additional 1-dimensional `beam_idx` input. `beam_idx` must contain the indices of the batch elements which are intended to be selected and will evolve during the next beam search iteration. There's only one beam when the generation starts. That beam corresponds to the initial prompt. `beam_idx` must have the values `[0, 0]` to keep the initial beam and introduce its copy. The dynamic batch size makes it possible to change the number of beams on the fly. `beam_idx` must have the value `[1]` to remove the zeroth sequence and keep the second beam only. + +Assume there are two running beams. To proceed with generating both beams at the next iteration, `beam_idx` values must be `[0, 1]`, pointing to batch elements `0` and `1`. To drop the last beam and split the other beam in two, `beam_idx` must be set to `[0, 0]`. This results in utilizing only the part of the KV cache corresponding to the zeroth element in the batch. The process of selecting the proper entries in the cache is called Cache Reorder. + +![](beam_idx-fork.gif) +![](beam_idx-drop.gif) + +The images below represent stateless and stateful LLM pipelines. The model has 4 inputs: +1. `input_ids` contains the next selected token +2. `attention_mask` is filled with `1` +3. `position_ids` encodes the position of the currently generated token in the sequence +4. `beam_idx` selects beams + +The model has one output, `logits`, describing the predicted distribution over the next tokens, and it carries the KV-cache state internally. + +![](stateless.jpg) +![](stateful.jpg) + +## Supported models + +1. chatglm + 1. https://huggingface.co/THUDM/chatglm2-6b - refer to + [chatglm2-6b - AttributeError: can't set attribute](../../../llm_bench/python/doc/NOTES.md#chatglm2-6b---attributeerror-cant-set-attribute) + in case of `AttributeError` + 2. https://huggingface.co/THUDM/chatglm3-6b +2. LLaMA 2 (requires access request submission on its Hugging Face page to be downloaded) + 1. https://huggingface.co/meta-llama/Llama-2-13b-chat-hf + 2. https://huggingface.co/meta-llama/Llama-2-13b-hf + 3. https://huggingface.co/meta-llama/Llama-2-7b-chat-hf + 4. https://huggingface.co/meta-llama/Llama-2-7b-hf + 5. https://huggingface.co/meta-llama/Llama-2-70b-chat-hf + 6. https://huggingface.co/meta-llama/Llama-2-70b-hf +3.
[Llama2-7b-WhoIsHarryPotter](https://huggingface.co/microsoft/Llama2-7b-WhoIsHarryPotter) +4. OpenLLaMA + 1. https://huggingface.co/openlm-research/open_llama_13b + 2. https://huggingface.co/openlm-research/open_llama_3b + 3. https://huggingface.co/openlm-research/open_llama_3b_v2 + 4. https://huggingface.co/openlm-research/open_llama_7b + 5. https://huggingface.co/openlm-research/open_llama_7b_v2 +5. [TinyLlama](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) +6. Qwen + 1. https://huggingface.co/Qwen/Qwen-7B-Chat + 2. https://huggingface.co/Qwen/Qwen-7B-Chat-Int4 - refer to + [Qwen-7B-Chat-Int4 - Torch not compiled with CUDA enabled](../../../llm_bench/python/doc/NOTES.md#qwen-7b-chat-int4---torch-not-compiled-with-cuda-enabled) + in case of `AssertionError` + 3. https://huggingface.co/Qwen/Qwen1.5-7B-Chat + 4. https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4 +7. Dolly + 1. https://huggingface.co/databricks/dolly-v2-3b +8. Phi + 1. https://huggingface.co/microsoft/phi-2 + 2. https://huggingface.co/microsoft/phi-1_5 +9. [notus-7b-v1](https://huggingface.co/argilla/notus-7b-v1) +10. [zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) +11. [redpajama-3b-chat](https://huggingface.co/ikala/redpajama-3b-chat) +12. [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) +13. [Gemma-2B-it](https://huggingface.co/google/gemma-2b-it) + +This pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. After conversion, the model is required to have the following inputs: +1. `input_ids` contains the tokens +2. `attention_mask` is filled with `1` +3. `beam_idx` selects beams +4. `position_ids` (optional) encodes the position of the currently generated token in the sequence + +and a single `logits` output. + +Some models may require access request submission on their Hugging Face page to be downloaded. + +If https://huggingface.co/ is down, the conversion step won't be able to download the models. + +> [!NOTE] +> Models should belong to the same family and have the same tokenizers.
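As a quick sanity check that a converted IR exposes the signature listed above before wiring it into a pipeline, the model's inputs and outputs can be printed with the plain OpenVINO runtime API; the model path below is only an example.

```cpp
// Sketch: print the inputs/outputs of an exported IR to confirm it matches the expected
// signature (input_ids, attention_mask, beam_idx, optional position_ids, single logits output).
#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("TinyLlama-1.1B-Chat-v1.0/openvino_model.xml");  // example path

    for (const auto& input : model->inputs())
        std::cout << "input:  " << input.get_any_name() << '\n';
    for (const auto& output : model->outputs())
        std::cout << "output: " << output.get_any_name() << '\n';
    // Expect input_ids, attention_mask, beam_idx (and usually position_ids) among the
    // inputs and a single logits output for the stateful pipeline described above.
}
```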
diff --git a/text_generation/causal_lm/cpp/beam_idx-drop.gif b/src/beam_idx-drop.gif similarity index 100% rename from text_generation/causal_lm/cpp/beam_idx-drop.gif rename to src/beam_idx-drop.gif diff --git a/text_generation/causal_lm/cpp/beam_idx-fork.gif b/src/beam_idx-fork.gif similarity index 100% rename from text_generation/causal_lm/cpp/beam_idx-fork.gif rename to src/beam_idx-fork.gif diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt new file mode 100644 index 0000000000..399ce29084 --- /dev/null +++ b/src/cpp/CMakeLists.txt @@ -0,0 +1,103 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +# Dependencies + +include(FetchContent) + +FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) +FetchContent_MakeAvailable(nlohmann_json) + +function(ov_genai_build_jinja2cpp) + FetchContent_Declare(jinja2cpp + URL https://github.com/jinja2cpp/Jinja2Cpp/archive/9ae7e1fc45d707e1686dd425a154d30963801944.tar.gz + URL_HASH SHA256=aa41ae425225623ba91be5de3ef1e0d942e682d519311e6235b04b4e7d880e01) + + FetchContent_GetProperties(jinja2cpp) + if(NOT jinja2cpp_POPULATED) + FetchContent_Populate(jinja2cpp) + + set(BUILD_SHARED_LIBS OFF) + set(JINJA2CPP_INSTALL OFF CACHE BOOL "") + set(JINJA2CPP_CXX_STANDARD 17 CACHE STRING "") + set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "") + set(JINJA2CPP_USE_REGEX "std" CACHE STRING "") + set(JINJA2CPP_WITH_JSON_BINDINGS "none" CACHE STRING "") + set(JINJA2CPP_STRICT_WARNINGS OFF CACHE BOOL "") + set(JINJA2CPP_PIC ON CACHE BOOL "") + + add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL) + endif() +endfunction() + +ov_genai_build_jinja2cpp() + +# Library + +file(GLOB SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") + +set(TARGET_NAME openvino_genai) +add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) +add_library(openvino::genai ALIAS ${TARGET_NAME}) + +target_include_directories(${TARGET_NAME} + PUBLIC "$" "$") + +target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp) + +target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17) + +# Extract two last digits from CMAKE_PROJECT_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols. 
+string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${CMAKE_PROJECT_VERSION_MAJOR}) +set_target_properties(${TARGET_NAME} PROPERTIES + EXPORT_NAME genai + VERSION ${CMAKE_PROJECT_VERSION} + SOVERSION ${MAJOR_SUFFIX}${CMAKE_PROJECT_VERSION_MINOR}${CMAKE_PROJECT_VERSION_PATCH} + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" +) + +find_package(Python3 REQUIRED COMPONENTS Interpreter Development) +install(TARGETS ${TARGET_NAME} + LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR} + RUNTIME DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + +# - Windows: `\runtime\bin\intel64\Release\` +# - MacOS_x86: `/runtime/lib/intel64/Release` +# - MacOS_arm64: `/runtime/lib/arm64/Release/` +# - Linux_x86: `/runtime/lib/intel64/` +# - Linux_arm64: `/runtime/lib/aarch64/` +string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH_DIR) +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(ARCH_DIR intel64) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") + if(APPLE) + set(ARCH_DIR "arm64") + else() + set(ARCH_DIR "aarch64") + endif() +elseif(ARCH_DIR STREQUAL "x86_64" OR ARCH_DIR STREQUAL "amd64" # Windows detects Intel's 64-bit CPU as AMD64 + OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") + set(ARCH_DIR intel64) +endif() +if(MSVC OR APPLE) + set(ARCH_DIR ${ARCH_DIR}/${CMAKE_BUILD_TYPE}) +endif() +install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets + LIBRARY DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai + NAMELINK_COMPONENT core_genai_dev + ARCHIVE DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai_dev + RUNTIME DESTINATION runtime/bin/${ARCH_DIR} COMPONENT core_genai + INCLUDES DESTINATION runtime/include) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION runtime/include COMPONENT core_genai_dev) +install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake NAMESPACE openvino:: DESTINATION runtime/cmake) +include(CMakePackageConfigHelpers) +configure_package_config_file(OpenVINOGenAIConfig.cmake.in "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" INSTALL_DESTINATION runtime/cmake) +install(FILES "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" DESTINATION runtime/cmake COMPONENT core_genai_dev) +include(CMakePackageConfigHelpers) +write_basic_package_version_file("${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" VERSION ${CMAKE_PROJECT_VERSION} COMPATIBILITY AnyNewerVersion) +export(EXPORT OpenVINOGenAITargets FILE "${CMAKE_BINARY_DIR}/OpenVINOGenAITargets.cmake" NAMESPACE openvino::) diff --git a/src/cpp/OpenVINOGenAIConfig.cmake.in b/src/cpp/OpenVINOGenAIConfig.cmake.in new file mode 100644 index 0000000000..c1f9c86c52 --- /dev/null +++ b/src/cpp/OpenVINOGenAIConfig.cmake.in @@ -0,0 +1,10 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) +find_dependency(OpenVINO COMPONENTS Runtime) + +if(NOT TARGET openvino_genai) + include("${CMAKE_CURRENT_LIST_DIR}/OpenVINOGenAITargets.cmake") +endif() + +check_required_components(OpenVINOGenAI) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp new file mode 100644 index 0000000000..f3355f252a --- /dev/null +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -0,0 +1,125 @@ 
+// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +#include "openvino/runtime/compiled_model.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/genai/tokenizer.hpp" + +namespace ov { +namespace genai { + +/** + * @brief controls the stopping condition for grouped beam search. The following values are possible: + * "EARLY" stops as soon as there are `num_beams` complete candidates. + "HEURISTIC" stops when it is unlikely to find better candidates. + "NEVER" stops when there cannot be better candidates. + */ +enum class StopCriteria { EARLY, HEURISTIC, NEVER }; + +/** + * @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + * and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + * be used while greedy and beam search parameters will not affect decoding at all. + * + * Generic parameters: + * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + * `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. + * @param max_new_tokens the maximum number of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + * @param ignore_eos if set to true, then generation will not stop even if an end-of-sentence (EOS) token is met. + * @param eos_token_id token_id of the end-of-sentence (EOS) token + * + * Beam search specific parameters: + * @param num_beams number of beams for beam search. 1 disables beam search. + * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from another group at a + * particular time. See https://arxiv.org/pdf/1909.05858. + * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to + * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + * likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while + * `length_penalty` < 0.0 encourages shorter sequences. + * @param num_return_sequences the number of sequences to return for grouped beam search decoding. + * @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once. + * @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values: + * "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where a + * heuristic is applied and the generation stops when it is very unlikely to find better candidates; + * "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). + * + * Random sampling parameters: + * @param temperature the value used to modulate token probabilities for random sampling. + * @param top_p if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. + * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
+ * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. + */ +class OPENVINO_GENAI_EXPORTS GenerationConfig { +public: + GenerationConfig() = default; + explicit GenerationConfig(const std::string& json_path); + + // Generic + size_t max_new_tokens = SIZE_MAX; + size_t max_length = SIZE_MAX; + bool ignore_eos = false; + + // Beam search specific + size_t num_beam_groups = 1; + size_t num_beams = 1; + float diversity_penalty = 1.0f; + float length_penalty = 1.0f; + size_t num_return_sequences = 1; + size_t no_repeat_ngram_size = std::numeric_limits::max(); + StopCriteria stop_criteria = StopCriteria::HEURISTIC; + + // Multinomial + float temperature = 1.0f; + float top_p = 1.0f; + size_t top_k = 50; + bool do_sample = false; + float repetition_penalty = 1.0f; + + // EOS special token + int64_t eos_token_id = -1; + + size_t get_max_new_tokens(size_t prompt_length = 0) const; + bool is_greedy_decoding() const; + bool is_beam_search() const; + bool is_multinomial() const; + void update_generation_config(const ov::AnyMap& config_map = {}); + + /// @brief checks that are no conflicting parameters, e.g. do_sample=true and num_beams > 1. + /// @throws Exception if config is invalid. + void validate() const; +}; + +/* + * utils that allow to use generate and operator() in the following way: + * pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...) + * pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...) +*/ +static constexpr ov::Property max_new_tokens{"max_new_tokens"}; +static constexpr ov::Property max_length{"max_length"}; +static constexpr ov::Property ignore_eos{"ignore_eos"}; + +static constexpr ov::Property num_beam_groups{"num_beam_groups"}; +static constexpr ov::Property num_beams{"num_beams"}; +static constexpr ov::Property diversity_penalty{"diversity_penalty"}; +static constexpr ov::Property length_penalty{"length_penalty"}; +static constexpr ov::Property num_return_sequences{"num_return_sequences"}; +static constexpr ov::Property no_repeat_ngram_size{"no_repeat_ngram_size"}; +static constexpr ov::Property stop_criteria{"stop_criteria"}; + +static constexpr ov::Property temperature{"temperature"}; +static constexpr ov::Property top_p{"top_p"}; +static constexpr ov::Property top_k{"top_k"}; +static constexpr ov::Property do_sample{"do_sample"}; +static constexpr ov::Property repetition_penalty{"repetition_penalty"}; +static constexpr ov::Property eos_token_id{"eos_token_id"}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp new file mode 100644 index 0000000000..2ffdfb7b8e --- /dev/null +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -0,0 +1,214 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include "openvino/core/any.hpp" +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/streamer_base.hpp" + +namespace ov { +namespace genai { + +using StreamerVariant = std::variant, std::shared_ptr, std::monostate>; +using OptionalGenerationConfig = std::optional; +using EncodedInputs = std::variant; +using StringInputs = std::variant>; + +/** +* @brief Structure to store resulting batched tokens and scores for each batch sequence +* +* @param tokens sequence of 
resulting tokens +* @param scores scores for each sequence +*/ +class EncodedResults { +public: + std::vector> tokens; + std::vector scores; +}; + +/** +* @brief Structure to store resulting batched text outputs and scores for each batch +* +* @param texts vector of resulting sequences +* @param scores scores for each sequence +*/ +class DecodedResults { +public: + std::vector texts; + std::vector scores; + + // @brief Convert DecodedResults to a vector of strings. + // @return A std::vector containing the texts from the DecodedResults object. + operator std::string() const { + OPENVINO_ASSERT(texts.size() == 1, "DecodedResults can be converted to string only if contains a single prompt"); + return texts.at(0); + } + + // @brief Convert DecodedResults to a single string. + // @return std::string containing the texts from the DecodedResults object. + operator std::vector() const { + return texts; + } + + // @brief Overloads operator<< to enhance output the contents of DecodedResults. + // @return A reference to the output stream with the concatenated texts. + friend std::ostream& operator<<(std::ostream& os, const DecodedResults& dr) { + for (size_t i = 0; i < dr.texts.size(); ++i) { + os << dr.texts[i]; + if (i != dr.texts.size() - 1) { + os << std::endl; + } + } + return os; + } +}; + +/** +* @brief This class is used for generation with LLMs. + */ +class OPENVINO_GENAI_EXPORTS LLMPipeline { +public: + /** + * @brief Constructs an LLMPipeline from xml/bin files, tokenizers and configuration in the same dir. + * + * @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json + * @param device optional device + * @param plugin_config optional plugin_config + */ + LLMPipeline( + const std::string& path, + const std::string& device="CPU", + const ov::AnyMap& plugin_config={} + ); + + /** + * @brief Constructs an LLMPipeline from already existing infer InferRequest and Tokenizer + * + * @param request infer request of the model + * @param tokenizer initialized Tokenizer + * @param generation_config optional generation_config, be default will be initialized for greedy decoding + */ + LLMPipeline( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, + OptionalGenerationConfig generation_config=std::nullopt + ); + + /** + * @brief Constructs a LLMPipeline when ov::Tokenizer is initialized manually using file from the different dirs. + * + * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param tokenizer manually initialized ov::Tokenizer + * @param device optional device + * @param plugin_config optional plugin_config + */ + LLMPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device="CPU", + const ov::AnyMap& plugin_config = {} + ); + + ~LLMPipeline(); + + /** + * @brief High level generate that receives prompts as a string or a vector of strings and returns decoded output. + * + * @param inputs input prompt or a vector of prompts + * @param generation_config optional GenerationConfig + * @param streamer optional streamer + * @return DecodedResults decoded resulting text + */ + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config=std::nullopt, + StreamerVariant streamer=std::monostate() + ); + + /** + * @brief High level generate that receives prompts as a string or a vector of strings and returns decoded output. 
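As a usage sketch (not part of this header): the two call styles declared here might be combined as follows. The model directory, prompt, and parameter values are placeholders, and the lambda form of the streamer is one of the `StreamerVariant` alternatives.

```
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");  // placeholder model dir

    // 1. Explicit GenerationConfig plus a lambda streamer that prints subwords
    //    as they are decoded; returning false keeps generation running.
    ov::genai::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 100;
    auto print_subword = [](std::string subword) { std::cout << subword << std::flush; return false; };
    pipe.generate("The Sun is yellow because", config, print_subword);

    // 2. The same request expressed with properties, which may come in any order.
    pipe.generate("The Sun is yellow because",
                  ov::genai::max_new_tokens(100),
                  ov::genai::streamer(print_subword));
    return 0;
}
```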
+ * properties can be in any order pipe.generate(..., ov::genai::max_new_tokens(100), ov::genai::streamer(lambda_func)). + * + * @param inputs input prompt or a vector of prompts + * @param properties properties + * @return DecodedResults decoded resulting text + */ + template + util::EnableIfAllStringAny generate( + StringInputs inputs, + Properties&&... properties) { + return generate(inputs, AnyMap{std::forward(properties)...}); + } + DecodedResults generate(StringInputs inputs, const ov::AnyMap& config_map); + + + DecodedResults operator()( + StringInputs inputs, + OptionalGenerationConfig generation_config=std::nullopt, + StreamerVariant streamer=std::monostate() + ) { + return generate(inputs, generation_config, streamer); + } + + template + util::EnableIfAllStringAny operator()( + StringInputs inputs, + Properties&&... properties) { + return generate(inputs, AnyMap{std::forward(properties)...}); + } + + /** + * @brief Low level generate to be called with already encoded input_ids tokens. + * Streamer cannot be used for multibatch inputs. + * + * @param input_ids or pair of (input_ids, attentino_mask) encoded input prompt tokens + * @param generation_config optional GenerationConfig + * @param streamer optional streamer + * @return EncodedResults a structure with resulting tokens and scores + * @throws Exception if the stremaer is set for inputs_ids with multiple batches + */ + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config=std::nullopt, + StreamerVariant streamer=std::monostate() + ); + + /** + * @brief Low level generate to be called with already encoded input_ids tokens. + * Streamer cannot be used for multibatch inputs. + * + * @param input_ids or pair of (input_ids, attentino_mask) encoded input prompt tokens + * @param generation config params + * @return EncodedResults a structure with resulting tokens and scores + * @throws Exception if the stremaer is set for inputs_ids with multiple batches + */ + template + util::EnableIfAllStringAny generate( + const EncodedInputs& inputs, + Properties&&... properties) { + return generate(inputs, AnyMap{std::forward(properties)...}); + } + EncodedResults generate(const EncodedInputs& inputs, const ov::AnyMap& config_map); + + ov::genai::Tokenizer get_tokenizer(); + GenerationConfig get_generation_config() const; + void set_generation_config(const GenerationConfig& config); + + void start_chat(); + void finish_chat(); + std::string apply_chat_template(std::string prompt, std::string role = "user") const; +private: + class LLMPipelineImpl; + std::unique_ptr m_pimpl; +}; + +std::pair streamer(StreamerVariant func); +std::pair generation_config(const GenerationConfig& config); + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp new file mode 100644 index 0000000000..04d350cc5d --- /dev/null +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/tokenizer.hpp" + +namespace ov { +namespace genai { + +/** + * @brief base class for streamers. 
To use it, inherit from this class and implement the put() and end() methods. + * + * @param m_tokenizer tokenizer +*/ +class StreamerBase { +public: + /// @brief put is called every time a new token is decoded. + /// @return bool flag indicating whether generation should be stopped; if it returns true, generation stops + virtual bool put(int64_t token) = 0; + + /// @brief end is called at the end of generation. It can be used to flush the cache if your own streamer has one + virtual void end() = 0; +}; + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp new file mode 100644 index 0000000000..1b95f81b29 --- /dev/null +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -0,0 +1,109 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include +#include "openvino/genai/visibility.hpp" + +namespace ov { +namespace genai { + +struct TokenizedInputs { + ov::Tensor input_ids; + ov::Tensor attention_mask; +}; + +/** +* @brief The class is used to encode prompts and decode resulting tokens +*/ +class OPENVINO_GENAI_EXPORTS Tokenizer { +public: + /** + * @brief ov::genai::Tokenizer constructor. + * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path + */ + Tokenizer(const std::string& tokenizer_path); + + /** + * @brief encode a single prompt + * @return pair of [input_ids, attention_mask] + */ + TokenizedInputs encode(const std::string prompt); + + /** + * @brief encode a batch of prompts. Left padding will be applied by default + * @param prompts vector storing the batch of prompts + * @return pair of [input_ids, attention_mask] + */ + TokenizedInputs encode(std::vector<std::string>& prompts); + TokenizedInputs encode(std::vector<std::string>&& prompts); + TokenizedInputs encode(std::initializer_list<std::string>& prompts); + + /** + * @brief decode a sequence of tokens + * @param tokens vector storing tokens + * @return sequence string + */ + std::string decode(std::vector<int64_t> tokens); + + /** + * @brief decode tokens. + * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] + * @return vector of std::string, with size = batch_size + */ + std::vector<std::string> decode(ov::Tensor tokens); + + /** + * @brief batched decoding of tokens. + * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size + * @return vector of std::string, with size equal to batch_size + */ + std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens); + + // information about BOS, EOS and PAD tokens should be public, + // they are used at least in StreamerBase descendants + int64_t get_bos_token_id() const; + int64_t get_eos_token_id() const; + int64_t get_pad_token_id() const; + + std::string get_bos_token() const; + std::string get_eos_token() const; + std::string get_pad_token() const; + + Tokenizer() = default; + ~Tokenizer(); +private: + class TokenizerImpl; + std::shared_ptr<TokenizerImpl> m_pimpl; +}; + +/** +* @brief Returns an absolute path. The path is this library's directory + * concatenated with the OS-specific openvino_tokenizers + * name (.so, .dll, .dylib, lib prefix). This is part of the interface + * because it's reused in Python bindings. + * tokenizers_relative_to_genai() and ScopedVar allow passing a path to + * openvino_tokenizers through an env var, removing one argument from + * Tokenizer's constructor.
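For illustration only (not code from these headers): a bare-bones streamer that inherits StreamerBase and leans on Tokenizer for detokenization might look like the sketch below. The model directory is a placeholder; the TextCallbackStreamer added later in this patch is the production variant of the same idea.

```
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "openvino/genai/llm_pipeline.hpp"

// Naive custom streamer: re-decodes the whole token cache on every call and
// prints only the newly appended characters.
class StdoutStreamer : public ov::genai::StreamerBase {
public:
    explicit StdoutStreamer(const ov::genai::Tokenizer& tokenizer) : m_tokenizer{tokenizer} {}

    bool put(int64_t token) override {
        m_cache.push_back(token);
        std::string text = m_tokenizer.decode(m_cache);
        // Unlike the patch's TextCallbackStreamer, this sketch does not
        // special-case incomplete UTF-8 sequences, so output may briefly
        // contain replacement characters.
        if (text.size() > m_printed) {
            std::cout << text.substr(m_printed) << std::flush;
            m_printed = text.size();
        }
        return false;  // false means "do not stop generation"
    }

    void end() override {
        std::cout << std::endl;
        m_cache.clear();
        m_printed = 0;
    }

private:
    ov::genai::Tokenizer m_tokenizer;
    std::vector<int64_t> m_cache;
    size_t m_printed = 0;
};

int main() {
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");  // placeholder model dir
    auto streamer = std::make_shared<StdoutStreamer>(pipe.get_tokenizer());
    pipe.generate("The Sun is yellow because", pipe.get_generation_config(), streamer);
    return 0;
}
```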
+*/ +OPENVINO_GENAI_EXPORTS std::filesystem::path tokenizers_relative_to_genai(); + +/** +* @brief Sets ENVIRONMENT_VARIABLE_NAME to environment_variable_value + * and unsets in destructor. Does nothing if ENVIRONMENT_VARIABLE_NAME + * was already defined. +*/ +class OPENVINO_GENAI_EXPORTS ScopedVar { +public: + explicit ScopedVar(const std::string& environment_variable_value); + ~ScopedVar(); + bool was_already_set; + static constexpr char ENVIRONMENT_VARIABLE_NAME[] = "OPENVINO_TOKENIZERS_PATH_GENAI"; +}; +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/visibility.hpp b/src/cpp/include/openvino/genai/visibility.hpp new file mode 100644 index 0000000000..4a1a60bb61 --- /dev/null +++ b/src/cpp/include/openvino/genai/visibility.hpp @@ -0,0 +1,12 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/core/visibility.hpp" + +#ifdef openvino_genai_EXPORTS +# define OPENVINO_GENAI_EXPORTS OPENVINO_CORE_EXPORTS +#else +# define OPENVINO_GENAI_EXPORTS OPENVINO_CORE_IMPORTS +#endif // openvino_genai_EXPORTS diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp new file mode 100644 index 0000000000..1bfe6cdb56 --- /dev/null +++ b/src/cpp/src/generation_config.cpp @@ -0,0 +1,129 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include +#include +#include "openvino/genai/generation_config.hpp" +#include "utils.hpp" + + +namespace ov { +namespace genai { + +GenerationConfig::GenerationConfig(const std::string& json_path) { + using ov::genai::utils::read_json_param; + + std::ifstream f(json_path); + OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); + + nlohmann::json data = nlohmann::json::parse(f); + + read_json_param(data, "max_new_tokens", max_new_tokens); + read_json_param(data, "max_length", max_length); + // note that ignore_eos is not present in HF GenerationConfig + read_json_param(data, "num_beam_groups", num_beam_groups); + read_json_param(data, "num_beams", num_beams); + read_json_param(data, "diversity_penalty", diversity_penalty); + read_json_param(data, "length_penalty", length_penalty); + read_json_param(data, "num_return_sequences", num_return_sequences); + read_json_param(data, "no_repeat_ngram_size", no_repeat_ngram_size); + read_json_param(data, "temperature", temperature); + read_json_param(data, "top_p", top_p); + read_json_param(data, "top_k", top_k); + read_json_param(data, "do_sample", do_sample); + read_json_param(data, "repetition_penalty", repetition_penalty); + read_json_param(data, "eos_token_id", eos_token_id); + + if (data.contains("early_stopping")) { + auto field_type = data["early_stopping"].type(); + if (field_type == nlohmann::json::value_t::string && data["early_stopping"] == "never") { + stop_criteria = StopCriteria::NEVER; + } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == true) { + stop_criteria = StopCriteria::EARLY; + } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == false) { + stop_criteria = StopCriteria::HEURISTIC; + } + } +} + +void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) { + using ov::genai::utils::read_anymap_param; + + read_anymap_param(config_map, "max_new_tokens", max_new_tokens); + read_anymap_param(config_map, "max_length", max_length); + read_anymap_param(config_map, "ignore_eos", ignore_eos); + 
read_anymap_param(config_map, "num_beam_groups", num_beam_groups); + read_anymap_param(config_map, "num_beams", num_beams); + read_anymap_param(config_map, "diversity_penalty", diversity_penalty); + read_anymap_param(config_map, "length_penalty", length_penalty); + read_anymap_param(config_map, "num_return_sequences", num_return_sequences); + read_anymap_param(config_map, "no_repeat_ngram_size", no_repeat_ngram_size); + read_anymap_param(config_map, "stop_criteria", stop_criteria); + read_anymap_param(config_map, "temperature", temperature); + read_anymap_param(config_map, "top_p", top_p); + read_anymap_param(config_map, "top_k", top_k); + read_anymap_param(config_map, "do_sample", do_sample); + read_anymap_param(config_map, "repetition_penalty", repetition_penalty); + read_anymap_param(config_map, "eos_token_id", eos_token_id); +} + +size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const { + // max_new_tokens has priority over max_length, only if max_new_tokens was not specified use max_length + if (max_new_tokens != SIZE_MAX) { + return max_new_tokens; + } else { + return max_length - prompt_length; + } +} + +bool GenerationConfig::is_greedy_decoding() const { + return !do_sample && !is_beam_search(); +} + +bool GenerationConfig::is_beam_search() const { + return num_beams > 1; +} + +bool GenerationConfig::is_multinomial() const { + return do_sample; +} + +void GenerationConfig::validate() const { + OPENVINO_ASSERT(!do_sample || num_beams == 1, + "Beam search with sampling is not supported yet. " + "Please either set do_sample=false to use beam search " + "or set num_beams=1 if you with to use multinomial sampling."); + + OPENVINO_ASSERT(max_new_tokens > 0, "'max_new_tokens' must be greater than 0"); + + // max_new_tokens has priority over max_length + // if max_new_tokens is defined no need to check max_length + OPENVINO_ASSERT(max_new_tokens != SIZE_MAX || max_length > 0, + "'max_length' must be greater than 0 or 'max_new_tokens' should be defined"); + + OPENVINO_ASSERT(!do_sample || top_k > 0, + "top_k must be a strictly positive, but got ", + top_k); + OPENVINO_ASSERT(!do_sample || (top_p > 0 && top_p <= 1.0f), + "top_p must be a positive float > 0 and < 1, but got ", + top_p); + OPENVINO_ASSERT(!do_sample || temperature > 0, + "Temperature must be a strictly positive float, but got ", + temperature); + + OPENVINO_ASSERT(repetition_penalty > 0, + "Repetition penalty must be a strictly positive float, but got ", + repetition_penalty); + + OPENVINO_ASSERT(!ignore_eos || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, + "ignore_eos == true, in this case either 'max_new_tokens', or 'max_length' should be defined."); + + OPENVINO_ASSERT(eos_token_id != -1 || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, + "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined."); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp new file mode 100644 index 0000000000..48125b7ab8 --- /dev/null +++ b/src/cpp/src/greedy_decoding.cpp @@ -0,0 +1,146 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" + +namespace ov { +namespace genai { + +EncodedResults greedy_decoding( + ov::InferRequest& m_model_runner, + ov::Tensor input_ids, + ov::Tensor attention_mask, + const ov::genai::GenerationConfig generation_config, + const std::shared_ptr streamer, + const bool is_chat_conversation, 
+ const bool is_cache_empty +) { + ov::Shape prompts_shape = input_ids.get_shape(); + const size_t batch_size = prompts_shape[0]; + size_t running_batch_size = batch_size; + size_t prompt_len = prompts_shape[1]; + + auto num_inputs = m_model_runner.get_compiled_model().inputs().size(); + bool position_ids_available = num_inputs == 4; + ov::Tensor position_ids; + + EncodedResults results; + results.scores.resize(running_batch_size); + results.tokens.resize(running_batch_size); + std::fill(results.scores.begin(), results.scores.end(), 0); + + int64_t kv_cache_len = 0; + if (is_chat_conversation && !is_cache_empty) { + OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1"); + + // between subsequent runs attention_mask should not be modified + auto atten_mask_history = m_model_runner.get_tensor("attention_mask"); + kv_cache_len = atten_mask_history.get_shape()[1]; + + size_t prompt_len = attention_mask.get_shape()[1]; + ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}}; + + std::copy(atten_mask_history.data(), atten_mask_history.data() + kv_cache_len, + new_atten_mask.data()); + std::copy(attention_mask.data(), attention_mask.data() + prompt_len, + new_atten_mask.data() + kv_cache_len); + + m_model_runner.set_tensor("attention_mask", new_atten_mask); + } else if (!is_cache_empty) { + OPENVINO_THROW("KV cache contains initial values but generate is run not in chat scenario. " + "Initial KV cache can contain values only if start_chat() is called."); + } else { + m_model_runner.set_tensor("attention_mask", attention_mask); + } + + if (position_ids_available) { + position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + utils::initialize_position_ids(position_ids, attention_mask, kv_cache_len); + } + + m_model_runner.set_tensor("input_ids", input_ids); + if (position_ids_available) + m_model_runner.set_tensor("position_ids", position_ids); + + m_model_runner.get_tensor("beam_idx").set_shape({running_batch_size}); + auto beam_data = m_model_runner.get_tensor("beam_idx").data(); + std::iota(beam_data, beam_data + running_batch_size, 0); + + m_model_runner.infer(); + auto logits = m_model_runner.get_tensor("logits"); + ov::Shape logits_shape = logits.get_shape(); + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; + m_model_runner.get_tensor("input_ids").set_shape({running_batch_size, 1}); + + std::vector token_iter_results(running_batch_size); // results of a single infer request + std::vector eos_met(running_batch_size, 0); // use int because can not use std::all_of with vector + for (size_t batch = 0; batch < running_batch_size; ++batch) { + auto out_token = utils::argmax(logits, batch); + results.tokens[batch].emplace_back(out_token); + + token_iter_results[batch] = out_token; + eos_met[batch] = (out_token == generation_config.eos_token_id); + m_model_runner.get_tensor("input_ids").data()[batch] = out_token; + } + if (streamer && streamer->put(token_iter_results[0])) { + return results; + } + + bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); + if (!generation_config.ignore_eos && all_are_eos) + return results; + + size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); + for (size_t i = 0; i < max_tokens - 1; ++i) { + if (position_ids_available) + utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); + m_model_runner.set_tensor("attention_mask", 
utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); + + m_model_runner.infer(); + auto logits = m_model_runner.get_tensor("logits"); + ov::Shape logits_shape = logits.get_shape(); + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; + + std::vector token_iter_results(running_batch_size); // results of a single infer request + std::vector eos_met(running_batch_size, 0); // use int because can not use std::all_of with vector + for (size_t batch = 0; batch < running_batch_size; ++batch) { + auto out_token = ov::genai::utils::argmax(logits, batch); + results.tokens[batch].emplace_back(out_token); + + token_iter_results[batch] = out_token; + eos_met[batch] = (out_token == generation_config.eos_token_id); + + m_model_runner.get_tensor("input_ids").data()[batch] = out_token; + } + + if (streamer && streamer->put(token_iter_results[0])) + return results; + + if (generation_config.ignore_eos) + continue; + + // stop generation when EOS is met in all batches + bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); + if (all_are_eos) + break; + + // Filter out batches where eos is met + std::vector beam_idx(running_batch_size); + std::iota(beam_idx.begin(), beam_idx.end(), 0); + auto end_it = std::remove_if(beam_idx.begin(), beam_idx.end(), [&eos_met](int idx) { return eos_met[idx]; }); + beam_idx.erase(end_it, beam_idx.end()); // Remove the eos met indices + + m_model_runner.get_tensor("beam_idx").set_shape({beam_idx.size()}); + auto beam_data = m_model_runner.get_tensor("beam_idx").data(); + std::copy(beam_idx.begin(), beam_idx.end(), beam_data); + running_batch_size = beam_idx.size(); + } + if (streamer) { + streamer->end(); + } + return results; +} + +} //namespace genai +} //namespace ov \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/src/cpp/src/group_beam_searcher.cpp similarity index 67% rename from text_generation/causal_lm/cpp/group_beam_searcher.hpp rename to src/cpp/src/group_beam_searcher.cpp index 6c97c869a3..991c3838f4 100644 --- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -3,6 +3,11 @@ #include +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" + +namespace { + // Modifyed Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurance in haystack std::vector kmp_search(const std::vector& haystack, const std::vector& needle) { if (needle.empty()) { // no_repeat_ngram_size == 1, ban every token @@ -80,16 +85,14 @@ bool greater(const Beam& left, const Beam& right) { return left.score > right.score; } -enum class StopCriteria { early, heuristic, never }; - struct Parameters { std::vector> prompts; - int64_t eos_token; + int64_t eos_token_id; size_t n_groups = 3; size_t group_size = 5; float diversity_penalty = 1.0; size_t max_new_tokens = 20; - StopCriteria stop_criteria = StopCriteria::heuristic; + ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::HEURISTIC; float length_penalty = 1.0; size_t no_repeat_ngram_size = std::numeric_limits::max(); @@ -106,11 +109,6 @@ struct Group { void finish(Beam&& beam, const Parameters& parameters) { beam.score /= std::pow(float(beam.tokens.size()), parameters.length_penalty); - // HF implementation counts eos_token for length penalty calculation - if (beam.tokens.back() == parameters.eos_token) { - beam.tokens.pop_back(); - } - min_heap.push_back(std::move(beam)); std::push_heap(min_heap.begin(), min_heap.end(), 
greater); if (min_heap.size() > parameters.group_size) { @@ -126,15 +124,15 @@ struct Group { float best_sum_logprobs = ongoing.front().score; float worst_score = min_heap.front().score; switch (parameters.stop_criteria) { - case StopCriteria::early: + case ov::genai::StopCriteria::EARLY: done = true; return; - case StopCriteria::heuristic: { + case ov::genai::StopCriteria::HEURISTIC: { float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty); done = worst_score >= highest_attainable_score; return; } - case StopCriteria::never: { + case ov::genai::StopCriteria::NEVER: { size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty); done = worst_score >= highest_attainable_score; @@ -267,7 +265,7 @@ struct GroupBeamSearcher { std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); group->ongoing.clear(); for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { - if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) { + if (parameters.eos_token_id == candidates.at(cand_idx).tokens.back()) { // If beam_token does not belong to top num_beams tokens, it should not be added if (cand_idx >= parameters.group_size) { continue; @@ -313,3 +311,140 @@ std::vector>> finalize(GroupBeamSearcher&& group_b return finalized; } + +void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { + request.set_tensor("input_ids", input_ids); + request.set_tensor("attention_mask", attention_mask); + + ov::Shape input_shape = input_ids.get_shape(); + auto num_inputs = request.get_compiled_model().inputs().size(); + bool position_ids_available = num_inputs == 4; + if (position_ids_available){ + ov::Tensor position_ids = request.get_tensor("position_ids"); + position_ids.set_shape(input_shape); + ov::genai::utils::initialize_position_ids(position_ids, attention_mask); + } + + ov::Tensor beam_idx = request.get_tensor("beam_idx"); + beam_idx.set_shape({input_shape.at(0)}); + std::fill_n(beam_idx.data(), input_shape.at(0), 0); +} + +void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector next_beams) { + ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; + ov::Shape original_shape = original_mask.get_shape(); + attention_mask.copy_to(original_mask); + + ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; + attention_mask.set_shape(new_shape); + + for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { + const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); + const size_t result_prompt_offset = beam_id * new_shape.at(1); + + int64_t* dest = attention_mask.data() + result_prompt_offset; + const int64_t* src = original_mask.data() + original_prompt_offset; + + std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); + attention_mask.data()[result_prompt_offset + new_shape.at(1) - 1] = 1; + } +} + +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { + const size_t batch_size = attention_mask.get_shape().at(0); + const size_t sequence_length = attention_mask.get_shape().at(1); + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* mask_start = attention_mask.data() + batch * sequence_length; + position_ids.data()[batch] = 
std::accumulate(mask_start, mask_start + sequence_length - 1, 0); + } +} + +} // namespace + +namespace ov { +namespace genai { + +EncodedResults beam_search(ov::InferRequest& lm, + ov::Tensor input_ids, + ov::Tensor attention_mask, + GenerationConfig config) { + OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, + "number of beams should be divisible by number of groups"); + + // Initialize beam search + const int64_t* prompt_data = input_ids.data(); + std::vector> prompts; + prompts.reserve(input_ids.get_shape().at(0)); + for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) { + size_t sequence_length = input_ids.get_shape().at(1); + size_t batch_offset = batch * sequence_length; + const int64_t* prompt_start = prompt_data + batch_offset; + prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); + } + + initialize_inputs(input_ids, attention_mask, lm); + + Parameters parameters{std::move(prompts)}; + parameters.max_new_tokens = config.max_new_tokens; + parameters.eos_token_id = config.eos_token_id; + parameters.n_groups = config.num_beam_groups; + parameters.group_size = config.num_beams / config.num_beam_groups; + parameters.diversity_penalty = config.diversity_penalty; + parameters.length_penalty = config.length_penalty; + parameters.stop_criteria = config.stop_criteria; + parameters.no_repeat_ngram_size = config.no_repeat_ngram_size; + GroupBeamSearcher group_beam_searcher{parameters}; + + std::vector next_tokens; + std::vector next_beams; + auto num_inputs = lm.get_compiled_model().inputs().size(); + bool position_ids_available = num_inputs == 4; + + for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { + lm.infer(); + + std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); + if (next_tokens.empty()) { + break; + } + size_t batch_size = next_tokens.size(); + // Set pointers + lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); + lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); + // Set auxiliary inputs + update_attention_mask_with_beams(lm.get_tensor("attention_mask"), next_beams); + if (position_ids_available) + update_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); + } + + auto scores_comparator = [](Beam& left, Beam& right) { + return (left.score > right.score); + }; + + std::vector beams; + auto result = finalize(std::move(group_beam_searcher)); + // align output with HF + for (size_t prompt_id = 0; prompt_id < result.size(); prompt_id++) { + auto prompt_group = result.at(prompt_id); + + for (const std::vector group : prompt_group) { + beams.insert(beams.end(), group.begin(), group.end()); + } + + // sort beams per prompt + auto start = beams.begin() + prompt_id * parameters.group_size * parameters.n_groups; + std::sort(start, beams.end(), scores_comparator); + } + + ov::genai::EncodedResults results; + for (auto beam = beams.begin(); beam != beams.begin() + config.num_return_sequences; ++beam) { + results.scores.emplace_back(beam->score); + results.tokens.emplace_back(beam->tokens); + } + return results; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp new file mode 100644 index 0000000000..ae763729a3 --- /dev/null +++ b/src/cpp/src/llm_pipeline.cpp @@ -0,0 +1,438 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include 
+#include +#include +#include + +#include +#include +#include + +#include +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" +#include "text_callback_streamer.hpp" + +namespace { + +const std::string STREAMER_ARG_NAME = "streamer"; +const std::string CONFIG_ARG_NAME = "generation_config"; + +ov::genai::GenerationConfig from_config_json_if_exists(const std::filesystem::path& model_path) { + auto config_file_path = model_path / "generation_config.json"; + if (std::filesystem::exists(config_file_path)) { + return ov::genai::GenerationConfig((config_file_path).string()); + } else { + return ov::genai::GenerationConfig{}; + } +} + +std::string chat_template_from_tokenizer_json_if_exists(const std::filesystem::path& path) { + auto tokenizer_config_file_path = path / "tokenizer_config.json"; + if (!std::filesystem::exists(tokenizer_config_file_path)) + return ""; + + std::ifstream file(tokenizer_config_file_path); + if (!file.is_open()) + return ""; + + std::string res = ""; + ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res); + return res; +} + +ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map) { + ov::genai::StreamerVariant streamer = std::monostate(); + + if (config_map.count(STREAMER_ARG_NAME)) { + auto any_val = config_map.at(STREAMER_ARG_NAME); + if (any_val.is>()) { + streamer = any_val.as>(); + } else if (any_val.is>()) { + streamer = any_val.as>(); + } + } + return streamer; +} + +ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config_map) { + if (config_map.count(CONFIG_ARG_NAME)) + return config_map.at(CONFIG_ARG_NAME).as(); + else + return std::nullopt; +} + +} + +namespace ov { +namespace genai { + +ov::genai::EncodedResults greedy_decoding( + ov::InferRequest& model_runner, + ov::Tensor prompts, + ov::Tensor attention_mask, + const GenerationConfig sampling_params, + const std::shared_ptr streamer, + const bool is_chat_conversation = false, + const bool is_cache_empty = true +); + +ov::genai::EncodedResults multinominal_decoding( + ov::InferRequest& model_runner, + ov::Tensor prompts, + ov::Tensor attention_mask, + GenerationConfig sampling_params, + std::shared_ptr streamer +); + +EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig config); + + +class LLMPipeline::LLMPipelineImpl { +public: + ov::InferRequest m_model_runner; + Tokenizer m_tokenizer; + GenerationConfig m_generation_config; + std::string m_chat_template = ""; + bool is_chat_conversation = false; + bool m_is_cache_empty = true; + + LLMPipelineImpl( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, + OptionalGenerationConfig generation_config=std::nullopt + ): m_model_runner(request), m_tokenizer(tokenizer) { + GenerationConfig default_config; + m_generation_config = (generation_config.has_value()) ? *generation_config : default_config; + } + + LLMPipelineImpl( + const std::filesystem::path& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config + ); + + LLMPipelineImpl( + const std::filesystem::path& path, + const std::string& device, + const ov::AnyMap& config + ); + + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) { + GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; + EncodedInputs encoded_input; + + if (auto input_vector = std::get_if>(&inputs)) { + encoded_input = m_tokenizer.encode(*input_vector); + } else if (auto input_str = std::get_if(&inputs)) { + + std::string text = *input_str; + // todo: make for batched inputs as well + if (is_chat_conversation) + text = apply_chat_template(text); + + // previous prompt generation in chat dialog stops with the end of sentence token, + // need to append this token to the current prompt + if (is_chat_conversation && !m_is_cache_empty) + text = m_tokenizer.get_eos_token() + text; + + auto res = m_tokenizer.encode(text); + auto input_ids = res.input_ids; + auto attention_mask = res.attention_mask; + + // todo: W/A If sentence begins with specfial tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", + // but HF does not do that. Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. + // Need to remove both of that tokens manually to get exact token by token alignment with HF + auto size = input_ids.get_shape(); + int64_t* inputs_data = input_ids.data(); + std::vector tmp_ids(inputs_data, inputs_data + input_ids.get_size()); // todo: works only for batch 1 + + auto attention_mask_data = attention_mask.data(); + std::vector tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size()); + + std::vector prefixes_to_exclude = {m_tokenizer.get_eos_token(), m_tokenizer.get_bos_token()}; + auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; }; + if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match)) { + tmp_ids.erase(tmp_ids.begin()); + tmp_attn_mask.erase(tmp_attn_mask.begin()); + } + + input_ids = ov::Tensor(input_ids.get_element_type(), {1, tmp_ids.size()}); + attention_mask = ov::Tensor(attention_mask.get_element_type(), {1, tmp_attn_mask.size()}); + std::copy(tmp_ids.begin(), tmp_ids.end(), input_ids.data()); + std::copy(tmp_attn_mask.begin(), tmp_attn_mask.end(), attention_mask.data()); + + encoded_input = TokenizedInputs{input_ids, attention_mask}; + } + + auto encoded_results = generate(encoded_input, config, streamer); + return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + } + + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) { + ov::Tensor input_ids; + ov::Tensor attention_mask; + + if (auto data = std::get_if(&inputs)) { + input_ids = *data; + } else if (auto data = std::get_if(&inputs)) { + input_ids = data->input_ids; + attention_mask = data->attention_mask; + } + + GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; + + // If eos_token_id was not provided, take value from default m_generation_config + if (config.eos_token_id == -1) + config.eos_token_id = m_generation_config.eos_token_id; + config.validate(); + + std::shared_ptr streamer_ptr; + if (auto streamer_obj = std::get_if(&streamer)) { + streamer_ptr = nullptr; + } else if (auto streamer_obj = std::get_if>(&streamer)) { + streamer_ptr = *streamer_obj; + } else if (auto callback = std::get_if>(&streamer)) { + streamer_ptr = std::make_shared(m_tokenizer, *callback); + } + + auto batch_size = input_ids.get_shape().at(0); + if ((batch_size != 1 || !(config.is_greedy_decoding() || config.is_multinomial())) && streamer_ptr) { + OPENVINO_THROW("Currently streaming is possible only with batch size=1 and " + "only for greedy or multinomial decoding"); + } + + auto num_inputs = m_model_runner.get_compiled_model().inputs().size(); + OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: " + "either (input_ids, attention_mask, beam_idx) or " + "(input_ids, attention_mask, position_ids, beam_idx) " + "but you have '" + std::to_string(num_inputs) + "' inputs"); + + ov::genai::EncodedResults result; + if (config.is_greedy_decoding()) { + result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask, + config, streamer_ptr, + is_chat_conversation, m_is_cache_empty); + } else if (config.is_beam_search()) { + result = beam_search(m_model_runner, input_ids, attention_mask, config); + } else if (config.is_multinomial()) { + result = multinominal_decoding(m_model_runner, input_ids, attention_mask, config, streamer_ptr); + } else { + OPENVINO_THROW("No decoding algorithm found for provided configuration parameters."); + } + + if (!is_chat_conversation) { + m_model_runner.reset_state(); + } else { + m_is_cache_empty = false; + } + + return result; + } + + std::string apply_chat_template(const std::vector>& prompts) const { + jinja2::TemplateEnv env; + env.GetSettings().lstripBlocks = true; + env.GetSettings().trimBlocks = true; + jinja2::Template tpl(&env); + tpl.Load(m_chat_template); + + jinja2::ValuesList messages; + for (const auto& [prompt, role] : prompts) { + messages.push_back(jinja2::ValuesMap{{"role", role}, {"content", prompt}}); + } + + jinja2::ValuesMap params = { + {"messages", messages}, + {"bos_token", m_tokenizer.get_bos_token()}, + {"eos_token", m_tokenizer.get_eos_token()}, + {"add_generation_prompt", true}, + }; + + return tpl.RenderAsString(params).value(); + } + + std::string apply_chat_template(std::string prompt, std::string role = "user") const { + jinja2::TemplateEnv env; + env.GetSettings().lstripBlocks = true; + env.GetSettings().trimBlocks = true; + jinja2::Template tpl(&env); + tpl.Load(m_chat_template); + + jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; + jinja2::ValuesMap params = { + {"messages", jinja2::ValuesList({message})}, + {"bos_token", m_tokenizer.get_bos_token()}, + {"eos_token", m_tokenizer.get_eos_token()}, + {"add_generation_prompt", true}, + }; + + return tpl.RenderAsString(params).value(); + } + + std::vector apply_chat_template(std::vector& prompts, std::string role = "user") const { + std::vector res; + for (const auto& prompt: prompts) { + res.emplace_back(apply_chat_template(prompt)); + } + return res; + } +}; + +DecodedResults LLMPipeline::generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer +) { + return m_pimpl->generate(inputs, generation_config, 
streamer); +} + +DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) { + auto config_arg = get_config_from_map(config_map); + GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); + config.update_generation_config(config_map); + + return m_pimpl->generate(text, config, get_streamer_from_map(config_map)); +} + +EncodedResults LLMPipeline::generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer +) { + return m_pimpl->generate(inputs, generation_config, streamer); +} + +EncodedResults LLMPipeline::generate(const EncodedInputs& inputs, const ov::AnyMap& config_map) { + auto config_arg = get_config_from_map(config_map); + GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); + config.update_generation_config(config_map); + + return m_pimpl->generate(inputs, config, get_streamer_from_map(config_map)); +} + +std::pair streamer(StreamerVariant func) { + if (auto streamer_obj = std::get_if>(&func)) { + return {STREAMER_ARG_NAME, Any::make>(*streamer_obj)}; + } else { + auto callback = std::get>(func); + return {STREAMER_ARG_NAME, Any::make>(callback)}; + } +} + +std::pair generation_config(const GenerationConfig& config) { + return {CONFIG_ARG_NAME, Any::make(config)}; +} + +} // namespace genai +} // namespace ov + +using namespace std; + +ov::genai::LLMPipeline::LLMPipeline( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, + OptionalGenerationConfig generation_config +) { + m_pimpl = std::make_unique(request, tokenizer, generation_config); +} + + +ov::genai::LLMPipeline::LLMPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config +) { + m_pimpl = make_unique(std::filesystem::path(model_path), tokenizer, device, plugin_config); +} + +ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( + const std::filesystem::path& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config +): m_tokenizer(tokenizer) { + ov::Core core; + + std::filesystem::path full_path = model_path; + if (full_path.extension() != ".xml") + full_path = model_path / "openvino_model.xml"; + m_model_runner = core.compile_model(full_path, device, plugin_config).create_infer_request(); +} + +ov::genai::LLMPipeline::LLMPipeline( + const std::string& path, + const std::string& device, + const ov::AnyMap& config +) { + m_pimpl = make_unique(std::filesystem::path(path), device, config); +} + +ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( + const std::filesystem::path& path, + const std::string& device, + const ov::AnyMap& config +): + m_model_runner{ov::Core{}.compile_model(path / "openvino_model.xml", device, config).create_infer_request()}, + m_tokenizer(path.string()), + m_generation_config{from_config_json_if_exists(path)}, + m_chat_template{chat_template_from_tokenizer_json_if_exists(path)} +{ + // If eos_token_id was not provided, take value + if (m_generation_config.eos_token_id == -1) + m_generation_config.eos_token_id = m_tokenizer.get_eos_token_id(); +} + +ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { + return m_pimpl->m_generation_config; +} + +ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() { + return m_pimpl->m_tokenizer; +} + +std::string ov::genai::LLMPipeline::apply_chat_template(std::string prompt, std::string role) 
const { + return m_pimpl->apply_chat_template(prompt, role); +} + + +void ov::genai::LLMPipeline::start_chat() { + m_pimpl->is_chat_conversation = true; + if (!m_pimpl->m_is_cache_empty) { + m_pimpl->m_model_runner.reset_state(); + m_pimpl->m_is_cache_empty = true; + } +} + +void ov::genai::LLMPipeline::finish_chat() { + m_pimpl->is_chat_conversation = false; + if (!m_pimpl->m_is_cache_empty) { + m_pimpl->m_model_runner.reset_state(); + m_pimpl->m_is_cache_empty = true; + } +} + +void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) { + int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id;; + m_pimpl->m_generation_config = config; + // if eos_token_id was not provided in config forward from default config + if (config.eos_token_id == -1) + m_pimpl->m_generation_config.eos_token_id = default_eos_token_id; + + m_pimpl->m_generation_config.validate(); +} + +ov::genai::LLMPipeline::~LLMPipeline() = default; diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp new file mode 100644 index 0000000000..60d2d87b78 --- /dev/null +++ b/src/cpp/src/multinomial_decoding.cpp @@ -0,0 +1,256 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" + + +namespace { + +struct TokenIdScore { + int64_t id; + float score; + + bool operator<(const TokenIdScore& other) const { + return score < other.score; + } + + bool operator>(const TokenIdScore& other) const { + return score > other.score; + } +}; + +void apply_softmax_inplace(std::vector& tokens) { + float max_score = std::max_element(tokens.begin(), tokens.end())->score; + float sum = 0.f; + + for (auto& token : tokens) { + float s = std::exp(token.score - max_score); + token.score = s; + sum += s; + } + + float inv_sum = 1.f / sum; + + for (auto& token : tokens) { + token.score *= inv_sum; + } +} + +TokenIdScore* sample_top_p(TokenIdScore* first, TokenIdScore* last, float top_p) { + // sort score + std::sort(first, last, std::greater()); + + int tokens_size = last - first; + std::vector token_scores(tokens_size); + for (size_t i = 0; i < tokens_size; i++) { + token_scores[i] = first[i]; + } + + // calculate softmax + apply_softmax_inplace(token_scores); + + float prefix_sum = 0.0f; + + // top_p + for (size_t i = 0; i < tokens_size; i++) { + prefix_sum += token_scores[i].score; + if (prefix_sum >= top_p) { + return first + (i + 1); + } + } + + return last; +} + +void apply_repetition_penalty(float* first, float* last, const std::vector& input_ids, float penalty) { + const float inv_penalty = 1.f / penalty; + const int vocab_size = last - first; + std::vector occurrence(vocab_size, false); + for (const int64_t id : input_ids) { + if (!occurrence[id]) { + first[id] *= (first[id] > 0) ? 
inv_penalty : penalty; + } + occurrence[id] = true; + } +} + +void apply_inv_temperature(float* first, float* last, float inv_temperature) { + for (float* it = first; it != last; it++) { + *it *= inv_temperature; + } +} + +struct RandomSampling { + const size_t top_k; + const float top_p; + const float inv_temperature; + const float repetition_penalty; + + std::mt19937 gen{std::random_device{}()}; + + RandomSampling(ov::genai::GenerationConfig generation_config) + : top_k{generation_config.top_k}, + top_p{generation_config.top_p}, + inv_temperature{1.f / generation_config.temperature}, + repetition_penalty{generation_config.repetition_penalty} { + generation_config.validate(); + } + + TokenIdScore get_out_token(float* logits, size_t vocab_size, const std::vector& tokens) { + // logits pre-process + if (repetition_penalty != 1.0f) { + apply_repetition_penalty(logits, logits + vocab_size, tokens, repetition_penalty); + } + + if (inv_temperature != 1.0f) { + apply_inv_temperature(logits, logits + vocab_size, inv_temperature); + } + + std::vector token_scores(vocab_size); + for (size_t i = 0; i < vocab_size; i++) { + token_scores[i] = TokenIdScore{int64_t(i), logits[i]}; + } + + // top_k sampling + if (0 < top_k && top_k < token_scores.size()) { + std::nth_element(token_scores.data(), + token_scores.data() + top_k, + token_scores.data() + token_scores.size(), + std::greater()); + token_scores.resize(top_k); + } + + // top_p sampling + if (0.f < top_p && top_p < 1.0f) { + auto pos = sample_top_p(token_scores.data(), token_scores.data() + token_scores.size(), top_p); + token_scores.resize(pos - token_scores.data()); + } + + // sample next token + apply_softmax_inplace(token_scores); + for (size_t i = 0; i < token_scores.size(); i++) { + logits[i] = token_scores[i].score; + } + + std::discrete_distribution<> dist(logits, logits + token_scores.size()); + return token_scores[dist(gen)]; + } +}; +} // namespace + +namespace ov { +namespace genai { + +ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner, + ov::Tensor input_ids, + ov::Tensor attention_mask, + ov::genai::GenerationConfig config, + std::shared_ptr streamer) { + ov::Shape prompts_shape = input_ids.get_shape(); + size_t batch_size = prompts_shape[0]; + + OPENVINO_ASSERT(batch_size == 1, "Only batch size = 1 supported for multinomial decoding"); + + size_t prompt_len = prompts_shape[1]; + + ov::genai::EncodedResults results; + results.scores.resize(batch_size, 0); + results.tokens.resize(batch_size); + + // Initialize inputs + m_model_runner.set_tensor("input_ids", input_ids); + m_model_runner.set_tensor("attention_mask", attention_mask); + + auto num_inputs = m_model_runner.get_compiled_model().inputs().size(); + bool position_ids_available = num_inputs == 4; + if (position_ids_available) { + ov::Tensor position_ids = m_model_runner.get_tensor("position_ids"); + position_ids.set_shape(input_ids.get_shape()); + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + } + + // Input values are persistent between inference calls. 
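Stepping aside from the file for a moment: the filtering chain that RandomSampling::get_out_token applies above (temperature, then top-k, then top-p, then softmax and sampling) can be illustrated with a self-contained toy. The logits and parameter values below are made up, and the code is a sketch of the general technique, not the implementation used here.

```
#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

// Toy reproduction of the temperature -> top-k -> top-p -> sample chain.
int main() {
    std::vector<float> logits = {2.0f, 1.0f, 0.5f, 0.1f, -1.0f};  // made-up values
    const float temperature = 0.7f;
    const size_t top_k = 3;
    const float top_p = 0.9f;

    // Temperature: divide logits (equivalent to multiplying by 1/temperature).
    for (float& l : logits) l /= temperature;

    // Sort token indices by logit, descending, and keep at most top_k of them.
    std::vector<size_t> order(logits.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { return logits[a] > logits[b]; });
    order.resize(std::min(top_k, order.size()));

    // Softmax over the kept candidates (subtract the max for numerical stability).
    std::vector<float> probs;
    float max_logit = logits[order.front()];
    float sum = 0.f;
    for (size_t idx : order) { probs.push_back(std::exp(logits[idx] - max_logit)); sum += probs.back(); }
    for (float& p : probs) p /= sum;

    // top_p: keep the smallest prefix whose cumulative probability reaches top_p.
    float cumulative = 0.f;
    size_t kept = probs.size();
    for (size_t i = 0; i < probs.size(); ++i) {
        cumulative += probs[i];
        if (cumulative >= top_p) { kept = i + 1; break; }
    }
    probs.resize(kept);
    order.resize(kept);

    // Sample the next token id from the filtered distribution.
    std::mt19937 gen{std::random_device{}()};
    std::discrete_distribution<size_t> dist(probs.begin(), probs.end());
    std::cout << "sampled token id: " << order[dist(gen)] << "\n";
    return 0;
}
```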
+ // That allows to set values, which aren't going to change, only once + m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + m_model_runner.get_tensor("beam_idx").data()[0] = 0; + + m_model_runner.infer(); + + auto logits_tensor = m_model_runner.get_tensor("logits"); + + int64_t sequence_offset = logits_tensor.get_shape().at(1) - 1; + size_t vocab_size = logits_tensor.get_shape().back(); + + float* logits = logits_tensor.data() + sequence_offset * vocab_size; + + const int64_t* input_ids_data = input_ids.data(); + + std::vector tokens{input_ids_data, input_ids_data + input_ids.get_size()}; + + RandomSampling sampling{config}; + + TokenIdScore out_token = sampling.get_out_token(logits, vocab_size, tokens); + + tokens.push_back(out_token.id); + results.tokens[0].push_back(out_token.id); + results.scores[0] += out_token.score; + + if (streamer && streamer->put(out_token.id)) { + return results; + } + + if (!config.ignore_eos && out_token.id == config.eos_token_id) { + return results; + } + + m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); + if (position_ids_available) + m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); + + size_t max_new_tokens = config.get_max_new_tokens(prompt_len); + + for (size_t i = 0; i < max_new_tokens - 1; i++) { + if (position_ids_available) { + ov::genai::utils::update_position_ids(m_model_runner.get_tensor("position_ids"), + m_model_runner.get_tensor("attention_mask")); + } + m_model_runner.set_tensor("attention_mask", + ov::genai::utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); + + m_model_runner.get_tensor("input_ids").data()[0] = out_token.id; + + m_model_runner.infer(); + + logits = m_model_runner.get_tensor("logits").data(); + out_token = sampling.get_out_token(logits, vocab_size, tokens); + + tokens.push_back(out_token.id); + results.tokens[0].push_back(out_token.id); + results.scores[0] += out_token.score; + + if (streamer && streamer->put(out_token.id)) { + return results; + } + + if (!config.ignore_eos && out_token.id == config.eos_token_id) { + break; + } + } + + if (streamer) { + streamer->end(); + } + + return results; +} +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp new file mode 100644 index 0000000000..8302594655 --- /dev/null +++ b/src/cpp/src/text_callback_streamer.cpp @@ -0,0 +1,45 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text_callback_streamer.hpp" + +namespace ov { +namespace genai { + +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback) { + m_tokenizer = tokenizer; + on_finalized_subword_callback = callback; +} + +bool TextCallbackStreamer::put(int64_t token) { + std::stringstream res; + m_tokens_cache.push_back(token); + std::string text = m_tokenizer.decode(m_tokens_cache); + if (!text.empty() && '\n' == text.back()) { + // Flush the cache after the new line symbol + res << std::string_view{text.data() + print_len, text.size() - print_len}; + m_tokens_cache.clear(); + print_len = 0; + return on_finalized_subword_callback(res.str()); + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + return on_finalized_subword_callback(res.str()); + } + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + return on_finalized_subword_callback(res.str()); +} + +void 
TextCallbackStreamer::end() { + std::stringstream res; + std::string text = m_tokenizer.decode(m_tokens_cache); + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + m_tokens_cache.clear(); + print_len = 0; + on_finalized_subword_callback(res.str()); + return; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp new file mode 100644 index 0000000000..7afc52b4f6 --- /dev/null +++ b/src/cpp/src/text_callback_streamer.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/tokenizer.hpp" + +namespace ov { +namespace genai { + +class TextCallbackStreamer: public StreamerBase { +public: + bool put(int64_t token) override; + void end() override; + + TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback); + + std::function on_finalized_subword_callback = [](std::string words)->bool { return false; }; +private: + Tokenizer m_tokenizer; + std::vector m_tokens_cache; + size_t print_len = 0; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp new file mode 100644 index 0000000000..3b79354594 --- /dev/null +++ b/src/cpp/src/tokenizer.cpp @@ -0,0 +1,428 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "openvino/genai/tokenizer.hpp" +#include "utils.hpp" +#include + +namespace { + +// todo: remove when openvino-tokenizers will support left padding +ov::genai::TokenizedInputs pad_left(ov::Tensor& input_ids, ov::Tensor& attention_mask, int64_t pad_token_id) { + const size_t batch_size = input_ids.get_shape()[0]; + const size_t sequence_length = input_ids.get_shape()[1]; + int64_t* inputs_data = input_ids.data(); + int64_t* attention_mask_data = attention_mask.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * sequence_length; + + // last token in the sequence is not a PAD_TOKEN, skipping + if (inputs_data[batch_offset + sequence_length - 1] != pad_token_id) + continue; + + size_t pad_tokens_number = 0; + for (int i = sequence_length - 1; i >= 0; i--) { + const size_t token_offset = batch_offset + i; + + if (inputs_data[token_offset] == pad_token_id) + continue; + + if (pad_tokens_number == 0) + pad_tokens_number = sequence_length - i - 1; + + std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); + std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); + } + } + + return {input_ids, attention_mask}; +} + +#ifdef _WIN32 +# include +# define MAX_ABS_PATH _MAX_PATH +# define get_absolute_path(result, path) _fullpath(result, path.c_str(), MAX_ABS_PATH) +#else +# include +# include +# define MAX_ABS_PATH PATH_MAX +# define get_absolute_path(result, path) realpath(path.c_str(), result) + +std::string get_absolute_file_path(const std::string& path) { + std::string absolutePath; + absolutePath.resize(MAX_ABS_PATH); + std::ignore = get_absolute_path(&absolutePath[0], path); + if (!absolutePath.empty()) { + // on Linux if file does not exist or no access, function will return NULL, but + // `absolutePath` will contain resolved path + absolutePath.resize(absolutePath.find('\0')); + return std::string(absolutePath); + } + std::stringstream ss; + ss << "Can't get absolute file path for [" << 
path << "], err = " << strerror(errno); + throw std::runtime_error(ss.str()); +} +#endif + +std::string get_ov_genai_library_path() { + #ifdef _WIN32 + CHAR genai_library_path[MAX_PATH]; + HMODULE hm = NULL; + if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + reinterpret_cast(get_ov_genai_library_path), + &hm)) { + std::stringstream ss; + ss << "GetModuleHandle returned " << GetLastError(); + throw std::runtime_error(ss.str()); + } + GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path)); + return std::string(genai_library_path); + #elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__) + Dl_info info; + dladdr(reinterpret_cast(get_ov_genai_library_path), &info); + return get_absolute_file_path(info.dli_fname).c_str(); + #else + # error "Unsupported OS" + #endif // _WIN32 +} + +std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { + #ifdef _WIN32 + constexpr char tokenizers[] = "openvino_tokenizers.dll"; + #elif __linux__ + constexpr char tokenizers[] = "libopenvino_tokenizers.so"; + #elif __APPLE__ + constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; + #endif + return path.parent_path() / tokenizers; +} + +constexpr char bos_token_key_name[] = "bos_token"; +constexpr char eos_token_key_name[] = "eos_token"; +constexpr char pad_token_key_name[] = "pad_token"; + +} // namespace + +namespace ov { +namespace genai { + +class Tokenizer::TokenizerImpl { +public: + ov::InferRequest m_tokenize_request; + ov::InferRequest m_detokenizer_request; + int64_t m_pad_token_id = -1; + int64_t m_bos_token_id = -1; + int64_t m_eos_token_id = -1; + + std::string m_pad_token = ""; + std::string m_bos_token = ""; + std::string m_eos_token = ""; + + TokenizerImpl() = default; + + TokenizerImpl(std::filesystem::path tokenizer_path) { + ov::Core core; + + if (tokenizer_path.extension() == ".xml") + OPENVINO_THROW("ov_tokenizers_path should be a path to a dir not a xml file"); + + const char* ov_tokenizers_path = getenv(ScopedVar::ENVIRONMENT_VARIABLE_NAME); + if (ov_tokenizers_path) { + core.add_extension(ov_tokenizers_path); + } else { + OPENVINO_THROW("openvino_tokenizers path is not set"); + } + + read_config(tokenizer_path); + read_special_tokens_map(tokenizer_path); + + // Try to read tokenizer_config if some token ids or token str are not defined. + read_tokenizer_config_if_necessary(tokenizer_path); + + auto device = "CPU"; // currently openvino_tokenizer supports only CPU + m_tokenize_request = core.compile_model(tokenizer_path / "openvino_tokenizer.xml", + device).create_infer_request(); + m_detokenizer_request = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", + device).create_infer_request(); + + // Get special token ids by inference if they are not defined. 
+ // todo: do not call until CVS-143410 is resolved + // infer_special_tokens_if_necessary(); + } + + // load special tokens ids from config.json + void read_config(const std::filesystem::path& tokenizer_path) { + auto config_file_path = tokenizer_path / "config.json"; + if (!std::filesystem::exists(config_file_path)) + return ; + std::ifstream file(config_file_path); + if (!file.is_open()) + return ; + + nlohmann::json data = nlohmann::json::parse(file); + using ov::genai::utils::read_json_param; + + read_json_param(data, "pad_token_id", m_pad_token_id); + read_json_param(data, "bos_token_id", m_bos_token_id); + read_json_param(data, "eos_token_id", m_eos_token_id); + } + + // Reads the string representation of special tokens if they exist. + void read_special_tokens_map(const std::filesystem::path& tokenizer_path) { + auto special_tokens_file_path = tokenizer_path / "special_tokens_map.json"; + if (!std::filesystem::exists(special_tokens_file_path)) + return ; + std::ifstream f(special_tokens_file_path); + if (!f.is_open()) + return ; + + nlohmann::json data = nlohmann::json::parse(f); + + using ov::genai::utils::read_json_param; + // they are in the format {"bos_token": { "content": "",... }} + auto read_token_content_str = [&data](std::string key_name, std::string& val) { + if (val == "" && data.contains(key_name)) { read_json_param(data[key_name], "content", val); } + }; + read_token_content_str(pad_token_key_name, m_pad_token); + read_token_content_str(bos_token_key_name, m_bos_token); + read_token_content_str(eos_token_key_name, m_eos_token); + } + + // Read string representation of special tokens if they exists. + // Also tries to load special token ids from added_tokens_decoder if they exist. + // Will not override special token strings or ids if they already exist + void read_tokenizer_config_if_necessary(const std::filesystem::path& tokenizer_path) { + if (m_pad_token_id != -1 && m_bos_token_id != -1 && m_eos_token_id != -1 && + !m_pad_token.empty() && !m_bos_token.empty() && !m_eos_token.empty()) { + return ; + } + + auto tokenizer_config_file_path = tokenizer_path / "tokenizer_config.json"; + if (!std::filesystem::exists(tokenizer_config_file_path)) + return ; + std::ifstream f(tokenizer_config_file_path); + if (!f.is_open()) + return ; + + nlohmann::json data = nlohmann::json::parse(f); + + // read special tokens string representation + // if they are presented directly {"bos_token": ""} + using ov::genai::utils::read_json_param; + auto read_token_str = [&data](std::string key_name, std::string& val) { + if (val.empty()) { read_json_param(data, key_name, val); } + }; + read_token_str(pad_token_key_name, m_pad_token); + read_token_str(bos_token_key_name, m_bos_token); + read_token_str(eos_token_key_name, m_eos_token); + + // if special tokens are not loaded directly, try to read + // if they are in the format {"bos_token": { "content": "",... 
}} + auto read_token_content_str = [&data](std::string key_name, std::string& val) { + if (val.empty() && data.contains(key_name)) { read_json_param(data[key_name], "content", val); } + }; + read_token_content_str(pad_token_key_name, m_pad_token); + read_token_content_str(bos_token_key_name, m_bos_token); + read_token_content_str(eos_token_key_name, m_eos_token); + + // special token ids integer representation are already defined + if (m_pad_token_id != -1 && m_bos_token_id != -1 && m_eos_token_id != -1) + return ; + + // values are stored as {"added_tokens_decoder": {"0": {"content": ""}}} + // token id is a key in the form of a string, need to do std::stoi + std::string spec_tokens_key_name = "added_tokens_decoder"; + if (!data.contains(spec_tokens_key_name)) + return ; + + // if added_tokens_decoder has different format items() will not fail + for (auto& [key, value] : data[spec_tokens_key_name].items()) { + if (!value.contains("content")) + continue; + auto content = value["content"]; + if (m_pad_token_id == -1 && content == m_pad_token) + m_pad_token_id = std::stoi(key); + if (m_bos_token_id == -1 && content == m_bos_token) + m_bos_token_id = std::stoi(key); + if (m_eos_token_id == -1 && content == m_eos_token) + m_eos_token_id = std::stoi(key); + } + } + + // tokenize str representation to get special tokens integer values + void infer_special_tokens_if_necessary() { + auto get_id_from_str = [this](std::string token_str, int64_t& token_val) { + if (token_val != -1 || token_str.empty()) + return ; + auto token_ids_tensor = this->encode(token_str).input_ids; + auto data = token_ids_tensor.data(); + auto data_len = token_ids_tensor.get_shape()[1]; + token_val = data[data_len - 1]; + }; + get_id_from_str(m_pad_token, m_pad_token_id); + get_id_from_str(m_bos_token, m_bos_token_id); + get_id_from_str(m_eos_token, m_eos_token_id); + } + + TokenizedInputs encode(std::string prompt) { + size_t batch_size = 1; + m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); + m_tokenize_request.infer(); + return get_copied_results(); + } + + TokenizedInputs encode(std::vector& prompts) { + m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); + auto size_ = m_tokenize_request.get_input_tensor().get_shape(); + m_tokenize_request.infer(); + + auto res = get_copied_results(); + pad_left(res.input_ids, res.attention_mask, m_pad_token_id); + return {res.input_ids, res.attention_mask}; + } + + TokenizedInputs get_copied_results() { + auto input_ids = m_tokenize_request.get_tensor("input_ids"); + auto attention_mask = m_tokenize_request.get_tensor("attention_mask"); + ov::Tensor input_ids_ = ov::Tensor(input_ids.get_element_type(), input_ids.get_shape()); + ov::Tensor attention_mask_ = ov::Tensor(attention_mask.get_element_type(), attention_mask.get_shape()); + input_ids.copy_to(input_ids_); + attention_mask.copy_to(attention_mask_); + + return {input_ids_, attention_mask_}; + } + + std::string decode(std::vector tokens) { + size_t batch_size = 1; + m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); + m_detokenizer_request.infer(); + return m_detokenizer_request.get_output_tensor().data()[0]; + } + + std::vector decode(ov::Tensor tokens) { + OPENVINO_ASSERT(tokens.get_element_type() == ov::element::i64, "tokens tensor element type should be an i64"); + OPENVINO_ASSERT(tokens.get_shape().size() == 2, "tokens tensor should of rank 2 with shape [batch_size, 
seq_len]"); + + m_detokenizer_request.set_input_tensor(tokens); + m_detokenizer_request.infer(); + + auto res = m_detokenizer_request.get_output_tensor(); + auto res_data = res.data(); + return std::vector(res_data, res_data + res.get_shape()[0]); + } + + std::vector decode(std::vector> lines) { + auto compare_lengths = [](const std::vector& a, const std::vector& b) { + return a.size() < b.size(); + }; + size_t max_len = std::max_element(lines.begin(), lines.end(), compare_lengths)->size(); + + ov::Tensor tokens = ov::Tensor{ov::element::i64, {lines.size(), max_len}}; + auto tokens_data = tokens.data(); + + for (size_t i = 0; i < lines.size(); ++i) { + const auto& line = lines[i]; + size_t line_len = line.size(); + std::copy(line.begin(), line.end(), tokens_data + i * max_len); + std::fill(tokens_data + i * max_len + line_len, tokens_data + (i + 1) * max_len, m_pad_token_id); + } + + m_detokenizer_request.set_input_tensor(tokens); + m_detokenizer_request.infer(); + auto res = m_detokenizer_request.get_output_tensor(); + auto res_data = res.data(); + return std::vector(res_data, res_data + res.get_shape()[0]); + } +}; + +Tokenizer::Tokenizer(const std::string& tokenizer_path) { + ov::genai::ScopedVar env_manager(tokenizers_relative_to_genai().string()); + m_pimpl = std::make_shared(tokenizer_path); +} + +TokenizedInputs Tokenizer::encode(const std::string prompt) { + return m_pimpl->encode(std::move(prompt)); +} + +TokenizedInputs Tokenizer::encode(std::vector& prompts) { + return m_pimpl->encode(prompts); +} + +TokenizedInputs Tokenizer::encode(std::vector&& prompts) { + return m_pimpl->encode(prompts); +} + +TokenizedInputs Tokenizer::encode(std::initializer_list& text) { + return encode(std::vector(text.begin(), text.end())); +} + +std::string Tokenizer::decode(std::vector tokens) { + return m_pimpl->decode(tokens); +} + +std::vector Tokenizer::decode(ov::Tensor tokens) { + return m_pimpl->decode(tokens); +} + +std::vector Tokenizer::decode(std::vector> lines) { + return m_pimpl->decode(lines); +} + +int64_t Tokenizer::get_bos_token_id() const { + return m_pimpl->m_bos_token_id; +} + +int64_t Tokenizer::get_eos_token_id() const { + return m_pimpl->m_eos_token_id; +} + +int64_t Tokenizer::get_pad_token_id() const { + return m_pimpl->m_pad_token_id; +} + +std::string Tokenizer::get_pad_token() const { + return m_pimpl->m_pad_token; +} + +std::string Tokenizer::get_bos_token() const { + return m_pimpl->m_bos_token; +} + +std::string Tokenizer::get_eos_token() const { + return m_pimpl->m_eos_token; +} + +Tokenizer::~Tokenizer() = default; + +std::filesystem::path tokenizers_relative_to_genai() { + return with_openvino_tokenizers(get_ov_genai_library_path()); +} + +ScopedVar::ScopedVar(const std::string& environment_variable_value) { +#ifdef _WIN32 + char* value = nullptr; + size_t len = 0; + _dupenv_s(&value, &len, ENVIRONMENT_VARIABLE_NAME); + if (value == nullptr) + _putenv_s(ENVIRONMENT_VARIABLE_NAME, environment_variable_value.c_str()); +#else + if (!getenv(ENVIRONMENT_VARIABLE_NAME)) + setenv(ENVIRONMENT_VARIABLE_NAME, environment_variable_value.c_str(), 1); +#endif + else + was_already_set = true; +} + +ScopedVar::~ScopedVar() { + if (!was_already_set) { +#ifdef _WIN32 + _putenv_s(ENVIRONMENT_VARIABLE_NAME, ""); +#else + unsetenv(ENVIRONMENT_VARIABLE_NAME); +#endif + } +} +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp new file mode 100644 index 0000000000..4b36c72009 --- /dev/null +++ b/src/cpp/src/utils.cpp @@ -0,0 +1,160 @@ +// 
Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "utils.hpp" +#include + +namespace ov { +namespace genai { +namespace utils { + +Tensor init_attention_mask(Tensor& position_ids) { + auto shape = position_ids.get_shape(); + auto attention_mask = ov::Tensor{position_ids.get_element_type(), shape}; + std::fill_n(attention_mask.data(), shape[0] * shape[1], 1); + return attention_mask; +} + +void print_tensor(const ov::Tensor& tensor) { + std::vector res; + + auto t_shape = tensor.get_shape(); + std::cout << "["; + for (size_t i = 0; i < t_shape[0]; ++i) { + std::cout << "|"; + for (size_t j = 0; j < t_shape[1]; ++j) { + if (tensor.get_element_type() == ov::element::i64) { + res.emplace_back(tensor.data()[t_shape[1] * i + j]); + std::cout << tensor.data()[t_shape[1] * i + j] << " "; + } + } + std::cout << "|"; + } + std::cout << "]" << std::endl; +} + +int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) { + if (logits.get_shape()[0] <= batch_idx) { + OPENVINO_THROW("logits batch size doesn't match the number of beams"); + } + + size_t vocab_size = logits.get_shape().back(); + size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; + size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; + const float* logits_data = logits.data() + batch_offset + sequence_offset; + + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + float max_logit = logits_data[out_token]; + + return out_token; +} + +/** + * Initializes position ids based on attention mask and starting position + */ +void initialize_position_ids(ov::Tensor& position_ids, + const ov::Tensor& attention_mask, + int64_t start_pos) { + OPENVINO_ASSERT(position_ids.get_element_type() == ov::element::i64, + "position_ids tensor element type should be an i64"); + OPENVINO_ASSERT(position_ids.get_shape().size() == 2, + "position_ids tensor should of rank 2 with shape [batch_size, seq_len]"); + OPENVINO_ASSERT(attention_mask.get_element_type() == ov::element::i64, + "attention_mask tensor element type should be an i64"); + OPENVINO_ASSERT(attention_mask.get_shape().size() == 2, + "attention_mask tensor should of rank 2 with shape [batch_size, seq_len]"); + + const size_t batch_size = attention_mask.get_shape()[0]; + const size_t seq_length = attention_mask.get_shape()[1]; + + const int64_t* attention_mask_data = attention_mask.data(); + int64_t* position_ids_data = position_ids.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + size_t sum = start_pos; + for (size_t i = 0; i < seq_length; i++) { + const size_t element_offset = batch * seq_length + i; + position_ids_data[element_offset] = sum; + if (attention_mask_data[element_offset] == 1) { + sum += 1; + } + } + } +} + +void initialize_beam_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { + request.set_tensor("input_ids", input_ids); + request.set_tensor("attention_mask", attention_mask); + + ov::Shape input_shape = input_ids.get_shape(); + + ov::Tensor position_ids = request.get_tensor("position_ids"); + position_ids.set_shape(input_shape); + initialize_position_ids(position_ids, attention_mask); + + ov::Tensor beam_idx = request.get_tensor("beam_idx"); + beam_idx.set_shape({input_shape.at(0)}); + std::fill_n(beam_idx.data(), input_shape.at(0), 0); +} + + +void set_attention_mask(ov::Tensor&& attention_mask, std::vector next_beams) { + ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; + 
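+    // (editorial note: the current mask is copied aside, the tensor is reshaped to
+    //  [num_beams, prev_len + 1], each row is re-gathered from the parent beam given by
+    //  next_beams, and a trailing 1 is appended for the token about to be generated.)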
ov::Shape original_shape = original_mask.get_shape(); + attention_mask.copy_to(original_mask); + + ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; + attention_mask.set_shape(new_shape); + + for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { + const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); + const size_t result_prompt_offset = beam_id * new_shape.at(1); + + int64_t* dest = attention_mask.data() + result_prompt_offset; + const int64_t* src = original_mask.data() + original_prompt_offset; + + std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); + attention_mask.data()[result_prompt_offset + new_shape.at(1) - 1] = 1; + } +} + +/** + * Set position ids tensor data for next token inference based on provided attention mask + * Supports multi batch + * Supports sparse attention_mask + */ +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { + const size_t batch_size = attention_mask.get_shape().at(0); + const size_t atten_length = attention_mask.get_shape().at(1); + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* start = attention_mask.data() + batch * atten_length; + // todo: be careful with start + atten_length, probably need to replace with start + atten_length -1 + position_ids.data()[batch] = std::accumulate(start, start + atten_length, 0); + } +} + +/** + * Get attention mask tensor for next token inference + * Supports multi batch + * Supports sparse attention_mask + */ +ov::Tensor extend_attention(ov::Tensor attention_mask) { + auto shape = attention_mask.get_shape(); + auto batch_size = shape[0]; + auto seq_len = shape[1]; + + ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; + auto old_data = attention_mask.data(); + auto new_data = new_atten_mask.data(); + for (size_t batch = 0; batch < batch_size; ++batch) { + std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); + new_data[batch * (seq_len + 1) + seq_len] = 1; + } + return new_atten_mask; +} + +} // namespace utils +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp new file mode 100644 index 0000000000..1684f71c96 --- /dev/null +++ b/src/cpp/src/utils.hpp @@ -0,0 +1,72 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +namespace ov { +namespace genai { +namespace utils { + +Tensor init_attention_mask(Tensor& position_ids); + +void print_tensor(const ov::Tensor& tensor); + +int64_t argmax(const ov::Tensor& logits, const size_t batch_idx); + +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0); + +ov::Tensor extend_attention(ov::Tensor attention_mask); + +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask); + +template +struct json_type_traits {}; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_integer; }; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_integer; }; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_unsigned; }; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_float; }; + 
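+// (editorial note: json_type_traits maps a C++ type to the nlohmann::json value_t tag so that
+//  read_json_param below can check that the stored JSON value is type-compatible before assigning,
+//  e.g. read_json_param(data, "eos_token_id", eos_id) leaves eos_id untouched when the key is
+//  missing or holds a value of a different type.)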
+template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::string; }; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::boolean; }; + +/// @brief reads value to param if T argument type is aligned with value stores in json +/// if types are not compatible leave param unchanged +template +void read_json_param(const nlohmann::json& data, const std::string& name, T& param) { + if (data.contains(name)) { + if constexpr (std::is_integral_v) { + if (data[name].is_number_integer() || data[name].is_number_unsigned()) { + param = data[name].get(); + } + } else if (data[name].type() == json_type_traits::json_value_t) { + param = data[name].get(); + } + } +} + +template +void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& param) { + if (config_map.count(name)) { + param = config_map.at(name).as(); + } +} + +std::tuple get_special_tokens_from_config_json(const std::filesystem::path& config_path); + +} // namespace utils +} // namespace genai +} // namespace ov diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt new file mode 100644 index 0000000000..1adeee111f --- /dev/null +++ b/src/python/CMakeLists.txt @@ -0,0 +1,51 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +include(FetchContent) +FetchContent_Declare( + pybind11 + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.12.0.tar.gz + URL_HASH SHA256=bf8f242abd1abcd375d516a7067490fb71abd79519a282d22b6e4d19282185a7 +) +FetchContent_GetProperties(pybind11) +if(NOT pybind11_POPULATED) + FetchContent_Populate(pybind11) + add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) +endif() + +pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) +target_link_libraries(py_generate_pipeline PRIVATE openvino::genai nlohmann_json::nlohmann_json) +set_target_properties(py_generate_pipeline PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" +) +file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" DESTINATION "${CMAKE_BINARY_DIR}/openvino_genai/") +write_file("${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" "__version__ = \"${CMAKE_PROJECT_VERSION}\"") + +# setting RPATH / LC_RPATH depending on platform +if(LINUX) + # to find libopenvino_genai.so in the same folder + set(rpaths "$ORIGIN") +elseif(APPLE) + # to find libopenvino_genai.dylib in the same folder + set(rpaths "@loader_path") + if(DEFINED SKBUILD) + # in case we build pip package, we need to refer to libopenvino.dylib from 'openvino' package + list(APPEND rpaths "@loader_path/../openvino/libs") + endif() +endif() + +if(rpaths) + set_target_properties(py_generate_pipeline PROPERTIES INSTALL_RPATH "${rpaths}") +endif() + +find_package(Python3 REQUIRED COMPONENTS Interpreter Development) +install(FILES "${CMAKE_BINARY_DIR}/openvino_genai/__init__.py" "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) +install(TARGETS openvino_genai py_generate_pipeline LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + +# wheel_genai component is used for wheel generation in pyproject.toml. +# Exclude wheel_genai from normal packaging because there's pygenai_X_Y component for that. +install(TARGETS openvino_genai py_generate_pipeline + LIBRARY DESTINATION . COMPONENT wheel_genai + RUNTIME DESTINATION . 
COMPONENT wheel_genai + EXCLUDE_FROM_ALL) diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py new file mode 100644 index 0000000000..1e3f0b393c --- /dev/null +++ b/src/python/openvino_genai/__init__.py @@ -0,0 +1,22 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import openvino # add_dll_directory for openvino lib +import os +from .__version__ import __version__ + + +if hasattr(os, "add_dll_directory"): + os.add_dll_directory(os.path.dirname(__file__)) + +from .py_generate_pipeline import LLMPipeline, Tokenizer, GenerationConfig, DecodedResults, EncodedResults, StreamerBase, StopCriteria + +__all__ = [ + 'LLMPipeline', + 'Tokenizer', + 'GenerationConfig', + 'DecodedResults', + 'EncodedResults', + 'StreamerBase', + 'StopCriteria' +] diff --git a/src/python/openvino_genai/__version__.py b/src/python/openvino_genai/__version__.py new file mode 100644 index 0000000000..472f83a46f --- /dev/null +++ b/src/python/openvino_genai/__version__.py @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Will be overwritten by pyproject.toml or cmake. +__version__ = "0.0.0.0" diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp new file mode 100644 index 0000000000..743d832099 --- /dev/null +++ b/src/python/py_generate_pipeline.cpp @@ -0,0 +1,264 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include "openvino/genai/llm_pipeline.hpp" + +namespace py = pybind11; +using ov::genai::LLMPipeline; +using ov::genai::Tokenizer; +using ov::genai::GenerationConfig; +using ov::genai::EncodedResults; +using ov::genai::DecodedResults; +using ov::genai::StopCriteria; +using ov::genai::StreamerBase; +using ov::genai::StreamerVariant; +using ov::genai::OptionalGenerationConfig; + +namespace { + +void update_config_from_kwargs(GenerationConfig& config, const py::kwargs& kwargs) { + if (kwargs.contains("max_new_tokens")) config.max_new_tokens = kwargs["max_new_tokens"].cast(); + if (kwargs.contains("max_length")) config.max_length = kwargs["max_length"].cast(); + if (kwargs.contains("ignore_eos")) config.ignore_eos = kwargs["ignore_eos"].cast(); + if (kwargs.contains("num_beam_groups")) config.num_beam_groups = kwargs["num_beam_groups"].cast(); + if (kwargs.contains("num_beams")) config.num_beams = kwargs["num_beams"].cast(); + if (kwargs.contains("diversity_penalty")) config.diversity_penalty = kwargs["diversity_penalty"].cast(); + if (kwargs.contains("length_penalty")) config.length_penalty = kwargs["length_penalty"].cast(); + if (kwargs.contains("num_return_sequences")) config.num_return_sequences = kwargs["num_return_sequences"].cast(); + if (kwargs.contains("no_repeat_ngram_size")) config.no_repeat_ngram_size = kwargs["no_repeat_ngram_size"].cast(); + if (kwargs.contains("stop_criteria")) config.stop_criteria = kwargs["stop_criteria"].cast(); + if (kwargs.contains("temperature")) config.temperature = kwargs["temperature"].cast(); + if (kwargs.contains("top_p")) config.top_p = kwargs["top_p"].cast(); + if (kwargs.contains("top_k")) config.top_k = kwargs["top_k"].cast(); + if (kwargs.contains("do_sample")) config.do_sample = kwargs["do_sample"].cast(); + if (kwargs.contains("repetition_penalty")) config.repetition_penalty = kwargs["repetition_penalty"].cast(); + if (kwargs.contains("eos_token_id")) config.eos_token_id = kwargs["eos_token_id"].cast(); +} + +py::object 
call_with_config(LLMPipeline& pipe, const std::string& text, const GenerationConfig& config, const StreamerVariant& streamer) { + if (config.num_return_sequences > 1) { + py::list res; + for (auto s: pipe.generate({text}, config, streamer).texts) { + PyObject* py_s = PyUnicode_DecodeUTF8(s.data(), s.length(), "replace"); + res.append(py_s); + } + return res; + } else { + auto res = std::string(pipe.generate(text, config, streamer)); + PyObject* py_str = PyUnicode_DecodeUTF8(res.data(), res.length(), "replace"); + return py::reinterpret_steal(py_str); + } +} + +std::vector call_with_config(LLMPipeline& pipe, const std::vector& text, const GenerationConfig& config, const StreamerVariant& streamer) { + return pipe.generate(text, config, streamer); +} + +std::vector call_with_kwargs(LLMPipeline& pipeline, const std::vector& texts, const py::kwargs& kwargs) { + GenerationConfig config = pipeline.get_generation_config(); + update_config_from_kwargs(config, kwargs); + return call_with_config(pipeline, texts, config, kwargs.contains("streamer") ? kwargs["streamer"].cast() : std::monostate()); +} + +py::object call_with_kwargs(LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { + // Create a new GenerationConfig instance and initialize from kwargs + GenerationConfig config = pipeline.get_generation_config(); + update_config_from_kwargs(config, kwargs); + return call_with_config(pipeline, text, config, kwargs.contains("streamer") ? kwargs["streamer"].cast() : std::monostate()); +} + +std::string ov_tokenizers_module_path() { + // Try a path relative to build artifacts folder first. + std::filesystem::path from_relative = ov::genai::tokenizers_relative_to_genai(); + if (std::filesystem::exists(from_relative)) { + return from_relative.string(); + } + return py::str(py::module_::import("openvino_tokenizers").attr("_ext_path")); +} + +class EmptyStreamer: public StreamerBase { + // It's impossible to create an instance of pure virtual class. Define EmptyStreamer instead. + bool put(int64_t token) override { + PYBIND11_OVERRIDE_PURE( + bool, // Return type + StreamerBase, // Parent class + put, // Name of function in C++ (must match Python name) + token // Argument(s) + ); + } + void end() override { + PYBIND11_OVERRIDE_PURE(void, StreamerBase, end); + } +}; + +ov::InferRequest& get_request_from_pyobj(py::object obj) { + py::str obj_type = py::str(obj.get_type()); + // todo: InferRequest is not accessible from the outside. + // obj_type is openvino._pyopenvino.InferRequest, + // which is a pybind binding to InferRequestWrapper (InferRequest is in a m_request field of the latest) + // and the definition of InferRequestWrapper is not accessible from the outside. + + if (py::isinstance(obj)) { + // Directly return the casted object without copying + return obj.cast(); + } else { + throw std::invalid_argument("Provided object is not castable to ov::InferRequest"); + } +} + +} // namespace + + +PYBIND11_MODULE(py_generate_pipeline, m) { + m.doc() = "Pybind11 binding for LLM Pipeline"; + + py::class_(m, "LLMPipeline") + .def(py::init([](const std::string& model_path, const std::string& device) { + ov::genai::ScopedVar env_manager(ov_tokenizers_module_path()); + return std::make_unique(model_path, device); + }), + py::arg("model_path"), "path to the model path", + py::arg("device") = "CPU", "device on which inference will be done", + R"( + LLMPipeline class constructor. + model_path (str): Path to the model file. + device (str): Device to run the model on (e.g., CPU, GPU). 
Default is 'CPU'. + )") + + .def(py::init(), + py::arg("model_path"), + py::arg("tokenizer"), + py::arg("device") = "CPU", + R"( + LLMPipeline class constructor for manualy created openvino_genai.Tokenizer. + model_path (str): Path to the model file. + tokenizer (openvino_genai.Tokenizer): tokenizer object. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. + )") + + .def(py::init([](py::object infer_request, + const Tokenizer& tokenizer, + OptionalGenerationConfig config) { + ov::genai::ScopedVar env_manager(ov_tokenizers_module_path()); + return std::make_unique(get_request_from_pyobj(infer_request), tokenizer, config); + }), + py::arg("infer_request"), "infer_request", + py::arg("tokenizer"), "openvino_genai.Tokenizer object", + py::arg("config"), "device on which inference will be done") + .def("generate", py::overload_cast(&call_with_kwargs), + R"( + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. + max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + ignore_eos: if set to true, then generation will not stop even if token is met. + eos_token_id: token_id of (end of sentence) + + Beam search specific parameters: + num_beams: number of beams for beam search. 1 disables beam search. + num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time. + length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to + the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while + `length_penalty` < 0.0 encourages shorter sequences. + num_return_sequences: the number of sequences to return for grouped beam search decoding. + no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where an + "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; + "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). + + Random sampling parameters: + temperature: the value used to modulate token probabilities for random sampling. + top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. + top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. + do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. 
+ )") + .def("generate", py::overload_cast&, + const py::kwargs&>(&call_with_kwargs)) + .def("generate", py::overload_cast&, + const GenerationConfig&, const StreamerVariant&>(&call_with_config)) + .def("generate", py::overload_cast(&call_with_config)) + + .def("__call__", py::overload_cast(&call_with_kwargs)) + .def("__call__", py::overload_cast(&call_with_config)) + + // todo: if input_ids is a ov::Tensor/numpy tensor + + .def("get_tokenizer", &LLMPipeline::get_tokenizer) + .def("start_chat", &LLMPipeline::start_chat) + .def("finish_chat", &LLMPipeline::finish_chat) + .def("get_generation_config", &LLMPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &LLMPipeline::set_generation_config) + .def("apply_chat_template", &LLMPipeline::apply_chat_template); + + // Binding for Tokenizer + py::class_(m, "Tokenizer", + R"(openvino_genai.Tokenizer object is used to initialize Tokenizer + if it's located in a different path than the main model.)") + .def(py::init([](const std::string& tokenizer_path) { + ov::genai::ScopedVar env_manager(ov_tokenizers_module_path()); + return std::make_unique(tokenizer_path); + }), py::arg("tokenizer_path")) + .def("get_pad_token_id", &Tokenizer::get_pad_token_id) + .def("get_bos_token_id", &Tokenizer::get_bos_token_id) + .def("get_eos_token_id", &Tokenizer::get_eos_token_id) + .def("get_pad_token", &Tokenizer::get_pad_token) + .def("get_bos_token", &Tokenizer::get_bos_token) + .def("get_eos_token", &Tokenizer::get_eos_token); + + // Binding for StopCriteria + py::enum_(m, "StopCriteria", + R"(StopCriteria controls the stopping condition for grouped beam search. The following values are possible: + "EARLY" stops as soon as there are `num_beams` complete candidates. + "HEURISTIC" stops when is it unlikely to find better candidates. 
+ "NEVER" stops when there cannot be better candidates.)") + .value("EARLY", StopCriteria::EARLY) + .value("HEURISTIC", StopCriteria::HEURISTIC) + .value("NEVER", StopCriteria::NEVER) + .export_values(); + + // Binding for GenerationConfig + py::class_(m, "GenerationConfig") + .def(py::init<>()) + .def(py::init()) + .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) + .def_readwrite("max_length", &GenerationConfig::max_length) + .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) + .def_readwrite("num_beam_groups", &GenerationConfig::num_beam_groups) + .def_readwrite("num_beams", &GenerationConfig::num_beams) + .def_readwrite("diversity_penalty", &GenerationConfig::diversity_penalty) + .def_readwrite("length_penalty", &GenerationConfig::length_penalty) + .def_readwrite("num_return_sequences", &GenerationConfig::num_return_sequences) + .def_readwrite("no_repeat_ngram_size", &GenerationConfig::no_repeat_ngram_size) + .def_readwrite("stop_criteria", &GenerationConfig::stop_criteria) + .def_readwrite("temperature", &GenerationConfig::temperature) + .def_readwrite("top_p", &GenerationConfig::top_p) + .def_readwrite("top_k", &GenerationConfig::top_k) + .def_readwrite("do_sample", &GenerationConfig::do_sample) + .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty) + .def_readwrite("eos_token_id", &GenerationConfig::eos_token_id); + + py::class_(m, "DecodedResults") + .def(py::init<>()) + .def_readwrite("texts", &DecodedResults::texts) + .def_readwrite("scores", &DecodedResults::scores); + + py::class_(m, "EncodedResults") + .def(py::init<>()) + .def_readwrite("tokens", &EncodedResults::tokens) + .def_readwrite("scores", &EncodedResults::scores); + + py::class_>(m, "StreamerBase") // Change the holder form unique_ptr to shared_ptr + .def(py::init<>()) + .def("put", &StreamerBase::put) + .def("end", &StreamerBase::end); +} diff --git a/text_generation/causal_lm/cpp/stateful.jpg b/src/stateful.jpg similarity index 100% rename from text_generation/causal_lm/cpp/stateful.jpg rename to src/stateful.jpg diff --git a/text_generation/causal_lm/cpp/stateless.jpg b/src/stateless.jpg similarity index 100% rename from text_generation/causal_lm/cpp/stateless.jpg rename to src/stateless.jpg diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py new file mode 100644 index 0000000000..ce6ef67600 --- /dev/null +++ b/tests/python_tests/conftest.py @@ -0,0 +1,10 @@ +def pytest_make_parametrize_id(config, val, argname): + if argname in ['prompt', 'promtps']: + return f'{val}' + elif argname == 'model_descr': + return f"{val[0]}" + elif argname in ['stop_criteria', 'generation_config']: + return str(val) + elif isinstance(val, (int, float, str)): + return f'{argname}={val}' + return None diff --git a/tests/python_tests/list_test_models.py b/tests/python_tests/list_test_models.py new file mode 100644 index 0000000000..514b2e5326 --- /dev/null +++ b/tests/python_tests/list_test_models.py @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pathlib + +def models_list(): + model_ids = [ + ("katuni4ka/tiny-random-phi3", "tiny-random-phi3"), + # ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0"), + # ("microsoft/phi-1_5", "phi-1_5/"), + + # ("google/gemma-2b-it", "gemma-2b-it"), + # ("google/gemma-7b-it", "gemma-7b-it"), + # ("meta-llama/Llama-2-7b-chat-hf", "Llama-2-7b-chat-hf"), + # ("meta-llama/Llama-2-13b-chat-hf", "Llama-2-13b-chat-hf"), + # ("openlm-research/open_llama_3b", 
"open_llama_3b"), + # ("openlm-research/open_llama_7b", "open_llama_7b"), + # ("databricks/dolly-v2-3b", "dolly-v2-3b"), + # ("databricks/dolly-v2-12b", "dolly-v2-12b"), + ] + import os + prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) + return [(model_id, prefix / model_path) for model_id, model_path in model_ids] + + +if __name__ == "__main__": + for model_id, model_path in models_list(): + print(model_id, model_path) diff --git a/tests/python_tests/pytest.ini b/tests/python_tests/pytest.ini new file mode 100644 index 0000000000..38a6279b5d --- /dev/null +++ b/tests/python_tests/pytest.ini @@ -0,0 +1,7 @@ +[pytest] + +markers = + precommit + nightly + +addopts = -m precommit diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt new file mode 100644 index 0000000000..fa7db3f2e8 --- /dev/null +++ b/tests/python_tests/requirements.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +optimum[openvino]==1.20.0 +pytest diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py new file mode 100644 index 0000000000..1d66348678 --- /dev/null +++ b/tests/python_tests/test_generate_api.py @@ -0,0 +1,553 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import functools +import openvino +import openvino_genai +import openvino_tokenizers +import optimum.intel +from openvino_genai import StopCriteria +import pytest +import transformers +from list_test_models import models_list +from typing import Union, List, Dict, Tuple +import sys +from pathlib import Path +import shutil +import json + +@functools.lru_cache(2) +def read_model(params): + model_id, path = params + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + hf_model = transformers.AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True) + + if not (path / 'openvino_model.xml').is_file(): + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True) + openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") + + # to store tokenizer config jsons with special tokens + tokenizer.save_pretrained(path) + hf_model.generation_config.save_pretrained(path) + hf_model.config.save_pretrained(path) + + optimum.intel.openvino.OVModelForCausalLM.from_pretrained( + model_id, export=True, trust_remote_code=True, + compile=False, device='CPU', load_in_8bit=False + ).save_pretrained(path) + # Return AutoModelForCausalLM instead of OVModelForCausalLM because + # there's no way to disable mmap for now. That prohibits the same + # model from being opened twice at the same time. + return ( + model_id, + path, + tokenizer, + hf_model, + openvino_genai.LLMPipeline(str(path)), + ) + + +def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): + model_id, path, tokenizer, model, pipe = model_descr + device = 'CPU' + config = generation_config.copy() # to avoid side effects + num_beams = config['num_beams'] if 'num_beams' in config else 1 + + if not isinstance(prompts, list): + prompts = [prompts] + + if 'do_sample' not in config: + # Some HF model has default do_sample = True, and if we test beam search + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set exlicitly to False, but only if test arguments omitted this arg. 
+ config['do_sample'] = False + + generation_config_hf = config.copy() + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = stop_criteria_map()[generation_config_hf.pop('stop_criteria')] + generation_config_hf.pop('ignore_eos', None) + + # Encode the batch of prompts + tokenizer.padding_side = "left" + encoded_prompts = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True) + prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask'] + + generation_config_hf['num_return_sequences'] = num_beams + hf_encoded_outputs = model.generate(prompt_ids, attention_mask=attention_mask, + **generation_config_hf) + + hf_outputs = [] + for idx, hf_encoded_out in enumerate(hf_encoded_outputs): + prompt_count = idx // num_beams + hf_outputs.append(tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) + + import openvino_genai as ov_genai + pipe = ov_genai.LLMPipeline(str(path), device) + + config['num_return_sequences'] = num_beams * len(prompts) + ov_outputs = pipe.generate(prompts, **config) + + hf_outputs.sort() + ov_outputs.sort() + for i, (hf_output, ov_output) in enumerate(zip(hf_outputs, ov_outputs)): + if hf_output != ov_output: + print(f'hf_output: {hf_output}') + print(f'ov_output: {ov_output}') + assert hf_output == ov_output + +def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt): + device = 'CPU' + model_id, path, tokenizer, model, pipe = model_descr + + config = generation_config.copy() # to avoid side effects + + if 'do_sample' not in config: + # Some HF model has default do_sample = True, and if we test beam search + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set exlicitly to False, but only if test arguments omitted this arg. + config['do_sample'] = False + + generation_config_hf = config.copy() + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = stop_criteria_map()[generation_config_hf.pop('stop_criteria')] + generation_config_hf.pop('ignore_eos', None) + + encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) + hf_encoded_output = model.generate(encoded_prompt, **generation_config_hf) + hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) + + import openvino_genai as ov_genai + pipe = ov_genai.LLMPipeline(str(path), device) + + ov_output = pipe.generate(prompt, **config) + if config.get('num_return_sequences', 1) > 1: + assert hf_output in ov_output + else: + if hf_output != ov_output: + print(f'hf_output: {hf_output}') + print(f'ov_output: {ov_output}') + + assert hf_output == ov_output + + +def stop_criteria_map(): + # in OpenVINO GenAI this parameter is called stop_criteria, + # while in HF it's called early_stopping. + # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" + return { + StopCriteria.NEVER: "never", + StopCriteria.EARLY: True, + StopCriteria.HEURISTIC: False + } + + +test_cases = [ + (dict(max_new_tokens=20), 'table is made of'), + (dict(max_new_tokens=20), '你好! 
你好嗎?'), + (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'Alan Turing was a'), + (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), +] +@pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.precommit +def test_decoding(model_descr, generation_config, prompt): + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + + +test_configs = [ + dict(max_new_tokens=20), + dict(max_new_tokens=200, ignore_eos=True), + dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) +] +batched_prompts = [ + ['table is made of', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], + ['hello', 'Here is the longest nowel ever: '], + ['Alan Turing was a', 'return 0', '你好! 你好嗎?'] +] +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompts", batched_prompts) +@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.precommit +@pytest.mark.xfail( + raises=AssertionError, reason="assert hf_output == ov_output fails", + strict=False, +) +def test_multibatch(model_descr, generation_config, prompts): + run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) + + +prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] +@pytest.mark.parametrize("num_beam_groups", [2, 3, 8]) +@pytest.mark.parametrize("group_size", [5, 3, 10]) +@pytest.mark.parametrize("max_new_tokens", [20, 15]) +@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.precommit +def test_beam_search_decoding(model_descr, num_beam_groups, group_size, + max_new_tokens, diversity_penalty, prompt): + generation_config = dict( + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=diversity_penalty, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, + ) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + + +@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("max_new_tokens", [10, 80]) +@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.precommit +def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): + # todo: with EARLY stop_criteria looks like HF return unvalid out with sentence + # while genai ends sentence with + if (stop_criteria == StopCriteria.EARLY): + pytest.skip() + generation_config = dict( + num_beam_groups=2, + num_beams=2 * 3, + diversity_penalty=1.0, + num_return_sequences=2 * 3, + max_new_tokens=max_new_tokens, + stop_criteria=stop_criteria, + ) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + + +# test long 
sequences +@pytest.mark.parametrize("num_beam_groups", [2]) +@pytest.mark.parametrize("group_size", [5]) +@pytest.mark.parametrize("max_new_tokens", [800, 2000]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.skip(reason="Will be enabled in nightly since the test are computationally expensive") +@pytest.mark.nightly +def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, + max_new_tokens, prompt): + generation_config = dict( + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=1.0, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, + ) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + + +def user_defined_callback(subword): + print(subword) + + +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.precommit +def test_callback_one_string(callback): + pipe = read_model(models_list()[0])[4] + generation_config = pipe.get_generation_config() + generation_config.max_new_tokens = 10 + pipe.generate('', generation_config, callback) + + +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.precommit +def test_callback_batch_fail(callback): + pipe = read_model(models_list()[0])[4] + with pytest.raises(RuntimeError): + pipe.generate(['1', '2'], openvino_genai.GenerationConfig(), callback) + + +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.precommit +def test_callback_kwargs_one_string(callback): + pipe = read_model(models_list()[0])[4] + pipe.generate('', max_new_tokens=10, streamer=callback) + + +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.precommit +def test_callback_kwargs_batch_fail(callback): + pipe = read_model(models_list()[0])[4] + with pytest.raises(RuntimeError): + pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback) + + +class Printer(openvino_genai.StreamerBase): + def __init__(self, tokenizer): + super().__init__() + self.tokenizer = tokenizer + def put(self, token_id): + # print(self.tokenizer.decode([token_id])) # Incorrect way to print, but easy to implement + print(token_id) # print only token because self.tokenizer.decode([token_id]) are not implemented yet + def end(self): + print('end') + + +@pytest.mark.precommit +@pytest.mark.xfail( + raises=RuntimeError, + reason="resulting token is out of vocabulary range on Mac", + strict=False, + condition=sys.platform == "darwin" +) +def test_streamer_one_string(): + pipe = read_model(models_list()[0])[4] + generation_config = pipe.get_generation_config() + generation_config.max_new_tokens = 10 + printer = Printer(pipe.get_tokenizer()) + pipe.generate('', generation_config, printer) + + +@pytest.mark.precommit +def test_streamer_batch_fail(): + pipe = read_model(models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + with pytest.raises(RuntimeError): + pipe.generate(['1', '2'], openvino_genai.GenerationConfig(), printer) + + +@pytest.mark.precommit +def test_streamer_kwargs_one_string(): + pipe = read_model(models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + pipe.generate('', max_new_tokens=10, do_sample=True, streamer=printer) + + +@pytest.mark.precommit +def test_streamer_kwargs_batch_fail(): + pipe = read_model(models_list()[0])[4] + printer = 
Printer(pipe.get_tokenizer()) + with pytest.raises(RuntimeError): + pipe.generate('', num_beams=2, streamer=printer) + + +@pytest.mark.precommit +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +def test_operator_with_callback_one_string(callback): + pipe = read_model(models_list()[0])[4] + ten_tokens = pipe.get_generation_config() + ten_tokens.max_new_tokens = 10 + pipe('', ten_tokens, callback) + + +@pytest.mark.precommit +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +def test_operator_with_callback_batch_fail(callback): + pipe = read_model(models_list()[0])[4] + with pytest.raises(TypeError): + pipe(['1', '2'], openvino_genai.GenerationConfig(), callback) + + +@pytest.mark.precommit +def test_operator_with_streamer_kwargs_one_string(): + pipe = read_model(models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + pipe('', max_new_tokens=10, do_sample=True, streamer=printer) + + +@pytest.mark.precommit +def test_operator_with_streamer_kwargs_batch_fail(): + pipe = read_model(models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + with pytest.raises(RuntimeError): + pipe('', num_beams=2, streamer=printer) + + +@pytest.fixture(scope="module") +def model_tmp_path(tmpdir_factory): + model_id, path, _, _, _ = read_model(models_list()[0]) + temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) + + # copy openvino converted model and tokenizers + for pattern in ['*.xml', '*.bin']: + for src_file in path.glob(pattern): + if src_file.is_file(): + shutil.copy(src_file, temp_path / src_file.name) + yield model_id, Path(temp_path) + + +# load Tokenizer where all configs are cleared +def load_tok(configs: List[Tuple], temp_path): + # remove existing jsons from previous tests + for json_file in temp_path.glob("*.json"): + json_file.unlink() + + for config_json, config_name in configs: + with (temp_path / config_name).open('w') as f: + json.dump(config_json, f) + return openvino_genai.Tokenizer(str(temp_path)) + + +# load LLMPipline where all configs are cleared +def load_pipe(configs: List[Tuple], temp_path): + # remove existing jsons from previous tests + for json_file in temp_path.glob("*.json"): + json_file.unlink() + + for config_json, config_name in configs: + with (temp_path / config_name).open('w') as f: + json.dump(config_json, f) + return openvino_genai.LLMPipeline(str(temp_path)) + +@pytest.mark.precommit +def test_load_special_tokens_ids_1(model_tmp_path): + # test when there is an available config.json + config_json = { + "pad_token_id": 422, + "bos_token_id": 42, + "eos_token_id": 37, + } + tok = load_tok([(config_json, "config.json")], model_tmp_path[1]) + assert tok.get_pad_token_id() == config_json['pad_token_id'] + assert tok.get_bos_token_id() == config_json['bos_token_id'] + assert tok.get_eos_token_id() == config_json['eos_token_id'] + + +@pytest.mark.precommit +def test_load_special_tokens_str_2(model_tmp_path): + # test with special_tokens_map + special_tokens_map_json = { + "pad_token": {"content": ""}, + "bos_token": {"content": ""}, + "eos_token": {"content": ""}, + } + tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1]) + assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"] + assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"] + assert tok.get_eos_token() == special_tokens_map_json['eos_token']["content"] + + +@pytest.mark.precommit +def 
test_load_special_tokens_3_(model_tmp_path): + # special_tokens_map is not available + # but tokenize_config.json exists + # will load both string and integer representations + tok_config_json = { + "added_tokens_decoder": { + "422": {"content": ""}, + "37": {"content": ""}, + "42": {"content": ""}, + }, + "pad_token": "", + "bos_token": "", + "eos_token": "", + } + + tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tmp_path[1]) + assert tok.get_pad_token() == tok_config_json['pad_token'] + assert tok.get_bos_token() == tok_config_json['bos_token'] + assert tok.get_eos_token() == tok_config_json['eos_token'] + + assert tok.get_pad_token_id() == 422 + assert tok.get_bos_token_id() == 37 + assert tok.get_eos_token_id() == 42 + + +@pytest.mark.precommit +def test_load_special_tokens_3(model_tmp_path): + # both config.json is availabel and tokenizer_config.json available + # check that it does not read int values from tokenizer_config.json if they are in config.json + tok_config_json = { + "added_tokens_decoder": { + # integers differ from config.json to check they don't override config.json + "777": {"content": ""}, + "888": {"content": ""}, + "656": {"content": ""}, + }, + "pad_token": "", + "bos_token": "", + "eos_token": "", + } + config_json = { + "pad_token_id": 422, + "bos_token_id": 42, + "eos_token_id": 37, + } + configs = [ + (tok_config_json, "tokenizer_config.json"), + (config_json, "config.json") + ] + tok = load_tok(configs, model_tmp_path[1]) + assert tok.get_pad_token_id() == config_json['pad_token_id'] + assert tok.get_bos_token_id() == config_json['bos_token_id'] + assert tok.get_eos_token_id() == config_json['eos_token_id'] + + assert tok.get_pad_token() == tok_config_json['pad_token'] + assert tok.get_bos_token() == tok_config_json['bos_token'] + assert tok.get_eos_token() == tok_config_json['eos_token'] + + +@pytest.mark.precommit +@pytest.mark.xfail( + raises=AssertionError, + reason="CVS-143410 ov tokenizer should be aligned with hf", + strict=False, +) +def test_load_special_tokens_4(model_tmp_path): + # only string representation is provided, find token integers by inference + model_id, temp_path = model_tmp_path + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + + special_tokens_map_json = {} + token_str_int_map = {} + special_token_names = ['pad_token', 'bos_token', 'eos_token'] + for token_str in special_token_names: + if hasattr(tokenizer, token_str): + token_val = getattr(tokenizer, token_str) + special_tokens_map_json.update({token_str: {"content": token_val}}) + token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0] + token_str_int_map.update({token_str: token_id}) + + # since only string representations are present in the json will try to get by inference + tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], temp_path) + + # check ids inferred correctly for special tokens existing if HF tokenizer + if 'pad_token' in token_str_int_map: + assert tok.get_pad_token_id() == token_str_int_map['pad_token'] + if 'bos_token' in token_str_int_map: + assert tok.get_bos_token_id() == token_str_int_map['bos_token'] + if 'eos_token' in token_str_int_map: + assert tok.get_eos_token_id() == token_str_int_map['eos_token'] + + +invalid_configs = [ + dict(num_beam_groups=3, num_beams=15, do_sample=True), + dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len + dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos + 
dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty + dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp + dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p + dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k +] +@pytest.mark.parametrize("generation_config", invalid_configs) +@pytest.mark.precommit +def test_invalid_configs(model_tmp_path, generation_config): + model_id, temp_path = model_tmp_path + config_json = {} + pipe = load_pipe([(config_json, "config.json")], temp_path) + with pytest.raises(RuntimeError): + pipe.generate('blah blah', **generation_config) + + +@pytest.mark.precommit +def test_valid_configs(model_tmp_path): + model_id, temp_path = model_tmp_path + pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) + + config = openvino_genai.GenerationConfig() + config.do_sample = True # no eos_token_id but it's loaded from config.json + pipe.set_generation_config(config) + +@pytest.mark.precommit +@pytest.mark.skipif(sys.platform.startswith("win"), reason="probably not enough space for this model on Win") +def test_unicode_pybind_decoding(): + # On this model this prompt generates unfinished utf string. + # Test that pybind will not fail. + model_id, path = ("microsoft/phi-1_5", Path("phi-1_5/")) + pipe = read_model((model_id, path))[4] + pipe.generate('你好! 你好嗎?', max_new_tokens=20) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt deleted file mode 100644 index 6da39c6abe..0000000000 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.15) -project(causal_lm) - -add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") - -add_executable(greedy_causal_lm greedy_causal_lm.cpp) -target_compile_definitions(greedy_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(greedy_causal_lm PRIVATE openvino::runtime) -set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) - -add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -target_compile_definitions(beam_search_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_include_directories(beam_search_causal_lm PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime) -set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) - -add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) -target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_include_directories(speculative_decoding_lm PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime) -set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) -find_package(TBB REQUIRED COMPONENTS tbb) -target_link_libraries(speculative_decoding_lm PRIVATE TBB::tbb) - -add_executable(prompt_lookup_decoding_lm 
prompt_lookup_decoding_lm.cpp) -target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_include_directories(prompt_lookup_decoding_lm PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime) -set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) -find_package(TBB REQUIRED COMPONENTS tbb) -target_link_libraries(prompt_lookup_decoding_lm PRIVATE TBB::tbb) diff --git a/text_generation/causal_lm/cpp/README.md b/text_generation/causal_lm/cpp/README.md deleted file mode 100644 index 08b91ab70e..0000000000 --- a/text_generation/causal_lm/cpp/README.md +++ /dev/null @@ -1,179 +0,0 @@ -# Text generation C++ samples that support most popular models like LLaMA 2 - -These examples showcase inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The applications intentionally expose few configuration options to encourage the reader to explore and modify the source code; for example, change the device for inference to GPU. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. [group_beam_searcher.hpp](group_beam_searcher.hpp) implements the algorithm of the same name, which is used by `beam_search_causal_lm`. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of an LLM-powered chatbot in Python. - -## How it works - -### Stateful LLM - -A common LLM inference optimisation is the introduction of a past KV (key/value) cache. This cache is represented by the corresponding inputs and outputs of a model originally implemented in a DL framework (e.g. PyTorch models from HuggingFace). To optimize it further and simplify usage, the model is transformed to a stateful form. This transformation improves inference performance and decreases the amount of runtime memory allocated in long-running text generation scenarios. It is achieved by hiding the inputs and outputs of the model that represent past KV-cache tensors and handling them inside the model in a more efficient way; the cache remains accessible through the state API. This is opposed to the stateless model approach, which requires manipulating these inputs and outputs explicitly. An introduction to stateful models can be found in https://docs.openvino.ai/2023.3/openvino_docs_OV_UG_stateful_models_intro.html. - -Hiding the KV-cache introduces a peculiarity for the beam search algorithm. Beam search suggests batched inference of multiple beams. The design described so far would result in generating multiple independent sequences of tokens. The beam search algorithm, on the other hand, requires removing some of the ongoing beams and splitting other beams into multiple branches. Beam removal requires deleting the corresponding KV-cache entry, and beam splitting requires copying the corresponding KV-cache values. - -To make it possible to implement beam search without accessing the model's internal state, a stateful LLM converted with `optimum-intel` or [llm_bench](../../../llm_bench/python/) introduces an additional 1-dimensional `beam_idx` input. `beam_idx` must contain the indexes of the batch elements that are intended to be selected and will evolve during the next beam search iteration. There's only one beam when the generation starts.
That beam corresponds to the initial prompt. `beam_idx` must have the values `[0, 0]` to keep the initial beam and introduce its copy. The dynamic batch size makes it possible to change the number of beams on the fly. `beam_idx` must have `[1]` as the value to remove the zeroth sequence and keep only the second beam. - -Assume there are two running beams. To proceed with generating both beams at the next iteration, `beam_idx` values must be `[0, 1]`, pointing to batch elements `0` and `1`. To drop the last beam and split the other beam in two, `beam_idx` must be set to `[0, 0]`. This results in utilizing only the part of the KV cache corresponding to the zeroth element in the batch. The process of selecting the proper entries in the cache is called Cache Reorder. - -![](beam_idx-fork.gif) -![](beam_idx-drop.gif) - -The images below represent stateless and stateful LLM pipelines. The model has 4 inputs: -1. `input_ids` contains the next selected token -2. `attention_mask` is filled with `1` -3. `position_ids` encodes the position of the currently generated token in the sequence -4. `beam_idx` selects beams - -The model has one output, `logits`, describing the predicted distribution over the next tokens. In addition, there is the KV-cache state. - -![](stateless.jpg) -![](stateful.jpg) - -### greedy_causal_lm - -The program loads a tokenizer, a detokenizer and a model (`.xml` and `.bin`) to OpenVINO. A prompt is tokenized and passed to the model. The model greedily generates token by token until the special end of sequence (EOS) token is obtained. The predicted tokens are converted to characters and printed in a streaming fashion. - -### beam_search_causal_lm - -The program loads a tokenizer, a detokenizer and a model (`.xml` and `.bin`) to OpenVINO. A prompt is tokenized and passed to the model. The model predicts a distribution over the next tokens, and group beam search samples from that distribution to explore possible sequences. The result is converted to characters and printed. - -### speculative_decoding_lm - -Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that speeds up token generation by running an additional, smaller draft model alongside the main model. - -Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. - -This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests.
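A minimal Python sketch may make this accept/verify cycle concrete. The callables `draft_next` and `main_logits` below are hypothetical stand-ins for the two models, not the sample's actual OpenVINO infer requests:

```python
# Sketch of greedy speculative decoding. `draft_next` and `main_logits` are
# illustrative placeholders, not part of the OpenVINO GenAI API.
from typing import Callable, List

def speculative_decode(prompt: List[int],
                       draft_next: Callable[[List[int]], int],
                       main_logits: Callable[[List[int]], List[List[float]]],
                       k: int = 4,
                       max_new_tokens: int = 100,
                       eos_token_id: int = 2) -> List[int]:
    tokens = list(prompt)
    while len(tokens) - len(prompt) < max_new_tokens:
        # 1. The draft model proposes K tokens one by one (cheap, autoregressive).
        draft = []
        for _ in range(k):
            draft.append(draft_next(tokens + draft))
        # 2. The main model scores prompt + draft in a single forward pass;
        #    row p of the returned logits predicts the token at position p + 1.
        logits = main_logits(tokens + draft)
        # 3. Walk the draft: keep tokens while the main model agrees; on the
        #    first mismatch keep the main model's token and start a new cycle.
        base = len(tokens)
        for i, proposed in enumerate(draft):
            row = logits[base - 1 + i]
            main_choice = max(range(len(row)), key=row.__getitem__)  # greedy argmax
            tokens.append(main_choice)
            if main_choice != proposed or main_choice == eos_token_id:
                break
        if tokens[-1] == eos_token_id:
            break
    return tokens
```

The C++ sample drives two stateful OpenVINO infer requests and manages their KV caches instead of Python callables, but the accept/reject logic follows the same pattern.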
More details can be found in the original papers https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf - -### prompt_lookup_decoding_lm - -[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching against the prompt to generate candidate token sequences. This method is highly effective for input-grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between the LLM input (prompt) and the LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality; a short illustrative sketch of the candidate search is included at the end of this README. - -> [!NOTE] ->Models should belong to the same family and have the same tokenizers. - -## Install OpenVINO - -Install [OpenVINO Archives >= 2024.1](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not yet released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for early testing of these branches. `` below refers to the extraction location. - -## Install `libtbb-dev` on Linux - -> [!NOTE] -> `tbb` development files are installed with OpenVINO Archive on Windows and macOS. - -```sh -sudo apt-get install libtbb-dev -``` - -## Build `greedy_causal_lm`, `beam_search_causal_lm` and `openvino_tokenizers` - -### Linux/macOS - -```sh -git submodule update --init -source /setupvars.sh -cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ && cmake --build ./build/ -j -``` - -### Windows - -```bat -git submodule update --init -\setupvars.bat -cmake -S .\ -B .\build\ && cmake --build .\build\ --config Release -j -``` - -### Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -#### Linux/macOS - -```sh -source /setupvars.sh -python3 -m pip install --upgrade-strategy eager -r requirements.txt -# Update openvino_tokenizers from the submodule -python3 -m pip install ./../../../thirdparty/openvino_tokenizers/[transformers] -optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -#### Windows - -```bat -\setupvars.bat -python -m pip install --upgrade-strategy eager -r requirements.txt -REM Update openvino_tokenizers from the submodule -python -m pip install .\..\..\..\thirdparty\openvino_tokenizers\[transformers] -optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -### Usage: -1. `greedy_causal_lm ""` -2. `beam_search_causal_lm ""` -3. `speculative_decoding_lm ""` -4. `prompt_lookup_decoding_lm ""` - -### Examples: - -#### Linux/MacOS: -1. `./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` -2. `./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` -3. `./build/speculative_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ ./Llama-2-7b-chat-hf/ "Why is the Sun yellow?"` -4. `./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` - -#### Windows: -1. 
`.\build\Release\greedy_causal_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` -2. `.\build\Release\beam_search_causal_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` -3. `.\build\Release\speculative_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\ .\Llama-2-7b-chat-hf\ "Why is the Sun yellow?"` -4. `.\build\Release\prompt_lookup_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` - -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -## Supported models - -1. chatglm - 1. https://huggingface.co/THUDM/chatglm2-6b - refer to - [chatglm2-6b - AttributeError: can't set attribute](../../../llm_bench/python/doc/NOTES.md#chatglm2-6b---attributeerror-cant-set-attribute) - in case of `AttributeError` - 2. https://huggingface.co/THUDM/chatglm3-6b -2. LLaMA 2 (requires access request submission on its Hugging Face page to be downloaded) - 1. https://huggingface.co/meta-llama/Llama-2-13b-chat-hf - 2. https://huggingface.co/meta-llama/Llama-2-13b-hf - 3. https://huggingface.co/meta-llama/Llama-2-7b-chat-hf - 4. https://huggingface.co/meta-llama/Llama-2-7b-hf - 5. https://huggingface.co/meta-llama/Llama-2-70b-chat-hf - 6. https://huggingface.co/meta-llama/Llama-2-70b-hf -3. [Llama2-7b-WhoIsHarryPotter](https://huggingface.co/microsoft/Llama2-7b-WhoIsHarryPotter) -4. OpenLLaMA - 1. https://huggingface.co/openlm-research/open_llama_13b - 2. https://huggingface.co/openlm-research/open_llama_3b - 3. https://huggingface.co/openlm-research/open_llama_3b_v2 - 4. https://huggingface.co/openlm-research/open_llama_7b - 5. https://huggingface.co/openlm-research/open_llama_7b_v2 -5. [TinyLlama](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) -6. Qwen - 1. https://huggingface.co/Qwen/Qwen-7B-Chat - 2. https://huggingface.co/Qwen/Qwen-7B-Chat-Int4 - refer to - 3. https://huggingface.co/Qwen/Qwen1.5-7B-Chat - 4. https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4 - [Qwen-7B-Chat-Int4 - Torch not compiled with CUDA enabled](../../../llm_bench/python/doc/NOTES.md#qwen-7b-chat-int4---torch-not-compiled-with-cuda-enabled) - in case of `AssertionError` -7. Dolly - 1. https://huggingface.co/databricks/dolly-v2-3b -8. Phi - 1. https://huggingface.co/microsoft/phi-2 - 2. https://huggingface.co/microsoft/phi-1_5 -9. [notus-7b-v1](https://huggingface.co/argilla/notus-7b-v1) -10. [zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) -11. [redpajama-3b-chat](https://huggingface.co/ikala/redpajama-3b-chat) -12. [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) -13. [Gemma-2B-it](https://huggingface.co/google/gemma-2b-it) - -This pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. - -Some models may require access request submission on their Hugging Face page to be downloaded. - -If https://huggingface.co/ is down, the conversion step won't be able to download the models. 
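As referenced in the `prompt_lookup_decoding_lm` section above, here is a minimal, hypothetical sketch of the prompt-lookup candidate search; the function name and default parameters are illustrative only and are not part of the samples:

```python
# Sketch of prompt-lookup candidate generation: match the last n-gram of the
# generated text against the prompt and copy the tokens that follow it.
from typing import List

def lookup_candidates(prompt: List[int], generated: List[int],
                      ngram_size: int = 3, num_candidates: int = 5) -> List[int]:
    context = prompt + generated
    if len(context) < ngram_size:
        return []
    tail = context[-ngram_size:]
    # Scan the prompt right-to-left for the most recent occurrence of `tail`.
    for start in range(len(prompt) - ngram_size, -1, -1):
        if prompt[start:start + ngram_size] == tail:
            follow = prompt[start + ngram_size:start + ngram_size + num_candidates]
            if follow:
                return follow  # candidates to be verified by the main model
    return []  # no match: fall back to ordinary one-token decoding
```

The returned candidates are then validated by the main model in a single inference request, in the same way drafted tokens are validated in speculative decoding.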
diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp deleted file mode 100644 index 110ac47178..0000000000 --- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp +++ /dev/null @@ -1,236 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -namespace { - -enum SPECIAL_TOKEN { PAD_TOKEN = 2 }; - -std::string detokenize(ov::InferRequest& detokenizer, const std::vector& tokens) { - constexpr size_t BATCH_SIZE = 1; - ov::Tensor inp = detokenizer.get_input_tensor(); - inp.set_shape({BATCH_SIZE, tokens.size()}); - for (size_t idx = 0; idx < tokens.size(); ++idx) { - inp.data()[idx] = tokens.at(idx); - } - detokenizer.infer(); - return detokenizer.get_output_tensor().data()[0]; -} - -std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask) { - const size_t batch_size = input_ids.get_shape().at(0); - const size_t sequence_length = input_ids.get_shape().at(1); - int64_t* inputs_data = input_ids.data(); - int64_t* attention_mask_data = attention_mask.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * sequence_length; - - // last token in the sequence is not a PAD_TOKEN, skipping - if (inputs_data[batch_offset + sequence_length - 1] != SPECIAL_TOKEN::PAD_TOKEN) { - continue; - } - - size_t pad_tokens_number = 0; - for (int i = sequence_length - 1; i >= 0; i--) { - const size_t token_offset = batch_offset + i; - - if (inputs_data[token_offset] == SPECIAL_TOKEN::PAD_TOKEN) { - continue; - } - - if (pad_tokens_number == 0) { - pad_tokens_number = sequence_length - i - 1; - } - - std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); - std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); - } - } - - return {input_ids, attention_mask}; -} - -std::pair tokenize(ov::InferRequest& tokenizer, std::vector prompts) { - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); - - tokenizer.infer(); - - pad_left(tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")); - - // fix mask filled with '2' instead of '0' - ov::Tensor attention_mask = tokenizer.get_tensor("attention_mask"); - int64_t* attention_mask_data = attention_mask.data(); - std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); - - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { - const size_t batch_size = attention_mask.get_shape().at(0); - const size_t sequence_length = attention_mask.get_shape().at(1); - - const int64_t* attention_mask_data = attention_mask.data(); - int64_t* position_ids_data = position_ids.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * sequence_length; - size_t sum = 0; - - for (size_t i = 0; i < sequence_length; i++) { - const size_t element_offset = batch_offset + i; - position_ids_data[element_offset] = sum; - if (attention_mask_data[element_offset] == 1) { - sum += 1; - } - } - } -} - -void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { - request.set_tensor("input_ids", input_ids); - request.set_tensor("attention_mask", attention_mask); - - ov::Shape input_shape = input_ids.get_shape(); - - 
ov::Tensor position_ids = request.get_tensor("position_ids"); - position_ids.set_shape(input_shape); - initialize_position_ids(position_ids, attention_mask); - - ov::Tensor beam_idx = request.get_tensor("beam_idx"); - beam_idx.set_shape({input_shape.at(0)}); - std::fill_n(beam_idx.data(), input_shape.at(0), 0); -} - -void set_attention_mask(ov::Tensor&& attention_mask, std::vector next_beams) { - ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; - ov::Shape original_shape = original_mask.get_shape(); - attention_mask.copy_to(original_mask); - - ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; - attention_mask.set_shape(new_shape); - - for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { - const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); - const size_t result_prompt_offset = beam_id * new_shape.at(1); - - int64_t* dest = attention_mask.data() + result_prompt_offset; - const int64_t* src = original_mask.data() + original_prompt_offset; - - std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); - attention_mask.data()[result_prompt_offset + new_shape.at(1) - 1] = 1; - } -} - -void set_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { - const size_t batch_size = attention_mask.get_shape().at(0); - const size_t sequence_length = attention_mask.get_shape().at(1); - position_ids.set_shape({batch_size, 1}); - - for (size_t batch = 0; batch < batch_size; batch++) { - int64_t* mask_start = attention_mask.data() + batch * sequence_length; - position_ids.data()[batch] = std::accumulate(mask_start, mask_start + sequence_length - 1, 0); - } -} - -std::vector prompts_arguments_to_vector(int argc, char* argv[]) { - std::vector prompts; - prompts.reserve(argc - 2); - for (size_t i = 2; i < argc; i++) { - prompts.push_back(std::string{argv[i]}); - } - return prompts; -} - -} // namespace - -int main(int argc, char* argv[]) try { - if (argc < 3) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " '' ['' ...]"); - } - - // Compile models - ov::Core core; - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - // Read the tokenizer model information from the file to later get the runtime information - auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); - // tokenizer and detokenizer work on CPU only - ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); - ov::InferRequest detokenizer = - core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - // The model can be compiled for GPU as well - ov::InferRequest lm = - core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); - - auto [input_ids, attention_mask] = tokenize(tokenizer, prompts_arguments_to_vector(argc, argv)); - - // Initialize beam search - const int64_t* prompt_data = input_ids.data(); - std::vector> prompts; - prompts.reserve(input_ids.get_shape().at(0)); - for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) { - size_t sequence_length = input_ids.get_shape().at(1); - size_t batch_offset = batch * sequence_length; - const int64_t* prompt_start = prompt_data + batch_offset; - prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); - } - - // Get the runtime info from the tokenizer model that we read earlier - auto rt_info = 
tokenizer_model->get_rt_info(); // Get the runtime info for the model - int64_t SPECIAL_EOS_TOKEN; - - if (rt_info.count("eos_token_id") > 0) { // check if the runtime information has a valid EOS token ID - SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); - - } else { - throw std::runtime_error("EOS token ID not found in model's runtime information."); - } - - Parameters parameters{std::move(prompts), SPECIAL_EOS_TOKEN}; - GroupBeamSearcher group_beam_searcher{parameters}; - - initialize_inputs(input_ids, attention_mask, lm); - - std::vector next_tokens; - std::vector next_beams; - - for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { - lm.infer(); - - std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); - if (next_tokens.empty()) { - break; - } - size_t batch_size = next_tokens.size(); - // Set pointers - lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); - // Set auxiliary inputs - set_attention_mask(lm.get_tensor("attention_mask"), next_beams); - set_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); - } - - for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { - std::cout << "Prompt:\n"; - for (const std::vector group : prompt_group) { - std::cout << "Group:\n"; - for (const Beam& beam : group) { - std::cout << beam.score << ": " << detokenize(detokenizer, beam.tokens) << '\n'; - } - } - } - // Model is stateful which means that context (kv-cache) which belongs to a particular - // text sequence is accumulated inside the model during the generation loop above. - // This context should be reset before processing the next text sequence. - // While it is not required to reset context in this sample as only one batch of sequences is processed, - // it is called for education purposes: - lm.reset_state(); -} catch (const std::exception& error) { - std::cerr << error.what() << '\n'; - return EXIT_FAILURE; -} catch (...) { - std::cerr << "Non-exception object thrown\n"; - return EXIT_FAILURE; -} diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp deleted file mode 100644 index d75d32d0e0..0000000000 --- a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include - -namespace { -std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { - constexpr size_t BATCH_SIZE = 1; - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); - tokenizer.infer(); - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -std::string detokenize(ov::InferRequest& detokenizer, std::vector& tokens) { - constexpr size_t BATCH_SIZE = 1; - detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()}); - detokenizer.infer(); - return detokenizer.get_output_tensor().data()[0]; -} - -// The following reasons require TextStreamer to keep a cache of previous tokens: -// detokenizer removes starting ' '. 
For example detokenize(tokenize(" a")) == "a", -// but detokenize(tokenize("prefix a")) == "prefix a" -// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" -struct TextStreamer { - ov::InferRequest detokenizer; - std::vector token_cache; - size_t print_len = 0; - - void put(int64_t token) { - token_cache.push_back(token); - std::string text = detokenize(detokenizer, token_cache); - if (!text.empty() && '\n' == text.back()) { - // Flush the cache after the new line symbol - std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; - token_cache.clear(); - print_len = 0; - return; - } - if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { - // Don't print incomplete text - return; - } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - } - - void end() { - std::string text = detokenize(detokenizer, token_cache); - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; - token_cache.clear(); - print_len = 0; - } -}; -} - -int main(int argc, char* argv[]) try { - if (argc != 3) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); - } - // Compile models - ov::Core core; - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - //Read the tokenizer model information from the file to later get the runtime information - auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); - // tokenizer and detokenizer work on CPU only - ov::InferRequest tokenizer = core.compile_model( - tokenizer_model, "CPU").create_infer_request(); - auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]); - ov::InferRequest detokenizer = core.compile_model( - std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - // The model can be compiled for GPU as well - ov::InferRequest lm = core.compile_model( - std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); - auto seq_len = input_ids.get_size(); - - // Initialize inputs - lm.set_tensor("input_ids", input_ids); - lm.set_tensor("attention_mask", attention_mask); - ov::Tensor position_ids = lm.get_tensor("position_ids"); - position_ids.set_shape(input_ids.get_shape()); - std::iota(position_ids.data(), position_ids.data() + seq_len, 0); - constexpr size_t BATCH_SIZE = 1; - // Input values are persistent between inference calls. 
- // That allows to set values, which aren't going to change, only once - lm.get_tensor("beam_idx").set_shape({BATCH_SIZE}); - lm.get_tensor("beam_idx").data()[0] = 0; - lm.infer(); - size_t vocab_size = lm.get_tensor("logits").get_shape().back(); - float* logits = lm.get_tensor("logits").data() + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; - - lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1}); - position_ids.set_shape({BATCH_SIZE, 1}); - TextStreamer text_streamer{std::move(detokenizer)}; - - // Get the runtime info from the tokenizer model that we read earlier - auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model - int64_t SPECIAL_EOS_TOKEN; - - if (rt_info.count("eos_token_id") > 0) { //check if the runtime information has a valid EOS token ID - SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); - } else { - throw std::runtime_error("EOS token ID not found in model's runtime information."); - } - - int max_sequence_length = 100; - while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) { - ++seq_len; - lm.get_tensor("input_ids").data()[0] = out_token; - lm.get_tensor("attention_mask").set_shape({BATCH_SIZE, seq_len}); - std::fill_n(lm.get_tensor("attention_mask").data(), seq_len, 1); - position_ids.data()[0] = int64_t(seq_len - 1); - lm.start_async(); - text_streamer.put(out_token); - lm.wait(); - logits = lm.get_tensor("logits").data(); - out_token = std::max_element(logits, logits + vocab_size) - logits; - } - text_streamer.end(); - // Model is stateful which means that context (kv-cache) which belongs to a particular - // text sequence is accumulated inside the model during the generation loop above. - // This context should be reset before processing the next text sequence. - // While it is not required to reset context in this sample as only one sequence is processed, - // it is called for education purposes: - lm.reset_state(); -} catch (const std::exception& error) { - std::cerr << error.what() << '\n'; - return EXIT_FAILURE; -} catch (...) { - std::cerr << "Non-exception object thrown\n"; - return EXIT_FAILURE; -} diff --git a/third-party-programs.txt b/third-party-programs.txt new file mode 100644 index 0000000000..e418d7b5e3 --- /dev/null +++ b/third-party-programs.txt @@ -0,0 +1,417 @@ +OpenVINO GenAI Third Party Programs File + +This file contains the list of third party software ("third party programs") +contained in the Intel software and their required notices and/or license +terms. This third party software, even if included with the distribution of +the Intel software, may be governed by separate license terms, including +without limitation, third party license terms, other Intel software license +terms, and open source software license terms. These separate license terms +govern your use of the third party programs as set forth in the +"third-party-programs.txt" or other similarly-named text file. + +Third party programs and their corresponding required notices and/or license +terms are listed below. + +------------------------------------------------------------- + +Jinja2Cpp + +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. 
"Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. 
Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. 
If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. 
If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. + +------------------------------------------------------------- + +JSON for Modern C++ (https://github.com/nlohmann/json) + +MIT License + +Copyright (c) 2013-2022 Niels Lohmann + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/thirdparty/CMakeLists.txt b/thirdparty/CMakeLists.txt new file mode 100644 index 0000000000..3e2f7deaf2 --- /dev/null +++ b/thirdparty/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +add_subdirectory(./openvino_tokenizers/ "${CMAKE_BINARY_DIR}/openvino_tokenizers/") +# Put binaries to a single dir to mimic package structure. +set_target_properties(openvino_tokenizers PROPERTIES + # Generator expressions to disable appending a per-configuration subdirectory (Release, Debug). + # ARCHIVE_OUTPUT is irrelevant. It's here just to keep all the artifacts in one place. + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" +) +if(TARGET core_tokenizers) + set_target_properties(core_tokenizers PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + ) +else() + # Prebuilt dependencies + if(WIN32) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/core_tokenizers.dll" + "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/third_party/lib/icudt70.dll" + "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/third_party/lib/icuuc70.dll") + elseif(LINUX) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.so") + elseif(APPLE) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.dylib") + endif() + add_custom_command(OUTPUT "${extra_libs}" + COMMAND "${CMAKE_COMMAND}" -E copy "${extra_libs}" "${CMAKE_BINARY_DIR}/openvino_genai/" + DEPENDS openvino_tokenizers) +endif() diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index c754503462..e5cb83bc4f 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit c754503462f569b648b598d57ff91ea57bb8deb1 +Subproject commit e5cb83bc4fd246014f5d4cb0dfb6e2a3d1343dc3