Port from master #1285

Merged
2 changes: 1 addition & 1 deletion .github/labeler.yml
@@ -36,7 +36,7 @@
- 'tests/cpp/generate_config.cpp'
- 'tests/cpp/sampler.cpp'

- - 'category: LoRA':
+ 'category: LoRA':
- 'src/cpp/include/openvino/genai/lora_adapter.hpp'
- 'src/cpp/src/lora_adapter.cpp'
- 'src/cpp/src/lora_helper.cpp'
4 changes: 2 additions & 2 deletions .github/workflows/windows.yml
@@ -243,7 +243,7 @@ jobs:
- name: Test bindings (wheel)
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
- python -m pip install . --verbose
+ python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

genai_python_lib_whisper:
@@ -307,7 +307,7 @@ jobs:
- name: Test bindings (wheel)
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
- python -m pip install . --verbose
+ python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

genai_python_lib_vlm:
35 changes: 30 additions & 5 deletions CMakeLists.txt
@@ -25,26 +25,45 @@ if(POLICY CMP0169)
cmake_policy(SET CMP0169 OLD)
endif()

+ if(UNIX AND NOT (APPLE OR ANDROID OR CYGWIN))
+ set(LINUX ON)
+ endif()

project(OpenVINOGenAI
VERSION 2024.5.0.0
DESCRIPTION "OpenVINO GenAI"
HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai"
LANGUAGES CXX C)

+ if(NOT DEFINED Python3_FIND_VIRTUALENV)
+ set(Python3_FIND_VIRTUALENV FIRST)
+ endif()

+ # Looking for OpenVINO in the python distribution. It doesn't work for cross-compiling build
+ if(NOT CMAKE_CROSSCOMPILING)
+ find_package(Python3 REQUIRED)
+ execute_process(
+ COMMAND ${Python3_EXECUTABLE} -c "from openvino.utils import get_cmake_path; print(get_cmake_path(), end='')"
+ OUTPUT_VARIABLE OpenVINO_DIR_PY
+ ERROR_QUIET
+ )
+ endif()

# Find OpenVINODeveloperPackage first to compile with SDL flags
find_package(OpenVINODeveloperPackage ${OpenVINOGenAI_VERSION} QUIET
COMPONENTS Runtime Threading
PATHS "${OpenVINO_DIR}")
if(NOT OpenVINODeveloperPackage_FOUND)
find_package(OpenVINO ${OpenVINOGenAI_VERSION} REQUIRED
- COMPONENTS Runtime Threading)
+ COMPONENTS Runtime Threading
+ PATHS "${OpenVINO_DIR_PY}")
endif()

include(cmake/features.cmake)

if(ENABLE_PYTHON)
# the following two calls are required for cross-compilation
- if(OpenVINODeveloperPackage_DIR)
+ if(OpenVINODeveloperPackage_FOUND)
ov_find_python3(REQUIRED)
ov_detect_python_module_extension()
else()
@@ -62,9 +81,15 @@ endif()

add_subdirectory(thirdparty)
add_subdirectory(src)
- add_subdirectory(samples)
- add_subdirectory(tools/continuous_batching)
- add_subdirectory(tests/cpp)
+ if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/samples")
+ add_subdirectory(samples)
+ endif()
+ if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/tools/continuous_batching")
+ add_subdirectory(tools/continuous_batching)
+ endif()
+ if(EXISTS "${OpenVINOGenAI_SOURCE_DIR}/tests/cpp")
+ add_subdirectory(tests/cpp)
+ endif()

install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI)
install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt)
25 changes: 21 additions & 4 deletions README.md
@@ -117,17 +117,34 @@ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --

### Run generation using VLMPipeline API in Python

+ See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/visual_language_chat) for a demo application.

+ Run the following command to download a sample image:

+ ```sh
+ curl -O "https://storage.openvinotoolkit.org/test_data/images/dog.jpg"
+ ```

```python
import numpy as np
+ import openvino as ov
import openvino_genai as ov_genai
- #Will run model on CPU, GPU is a possible option
+ from PIL import Image

+ # Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
pipe = ov_genai.VLMPipeline("./MiniCPM-V-2_6/", "CPU")
- rgb = read_image("cat.jpg")
- print(pipe.generate(prompt, image=rgb, max_new_tokens=100))

+ image = Image.open("dog.jpg")
+ image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
+ image_data = ov.Tensor(image_data)

+ prompt = "Can you describe the image?"
+ print(pipe.generate(prompt, image=image_data, max_new_tokens=100))
```

### Run generation using VLMPipeline in C++

- Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details)
+ Code below requires installation of C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details). See [Visual Language Chat](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/cpp/visual_language_chat) for a demo application.

```cpp
#include "load_image.hpp"
29 changes: 23 additions & 6 deletions pyproject.toml
@@ -3,16 +3,31 @@ name = "openvino-genai"
version = "2024.5.0.0"
description = "Library of the most popular Generative AI model pipelines, optimized execution methods, and samples"
requires-python = ">=3.9"
- readme = {file = "src/README.md", content-type="text/markdown"}
- license = {text = "OSI Approved :: Apache Software License"}
+ readme = { file = "src/README.md", content-type="text/markdown" }
+ license = { "file" = "LICENSE" }
authors = [
{ name = "OpenVINO Developers", email = "[email protected]" },
]
classifiers = [
"Development Status :: 5 - Production/Stable",
"License :: OSI Approved :: Apache Software License",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries :: Python Modules",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Operating System :: Unix",
"Operating System :: POSIX :: Linux",
"Operating System :: Microsoft :: Windows",
"Operating System :: MacOS",
"Programming Language :: C++",
"Programming Language :: C",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: Implementation :: CPython"
]
dependencies = [
"openvino_tokenizers~=2024.5.0.0.dev"
@@ -22,22 +37,24 @@ dependencies = [
directory = "src/python"

[tool.py-build-cmake.sdist]
- exclude = ["tools", "samples", "tests", "thirdparty"]
+ include = ["CMakeLists.txt", "LICENSE", "third-party-programs.txt", "SECURITY.md", "cmake", "src", "thirdparty"]

[tool.py-build-cmake.cmake]
minimum_version = "3.23"
build_type = "Release"
config = ["Release"]
find_python3 = true
- build_args = ["--parallel", "--target", "py_openvino_genai"]
+ build_args = ["--parallel", "--target", "py_openvino_genai_stub"]
install_args = ["--strip"]
install_components = ["wheel_genai"]
options = {"BUILD_TOKENIZERS" = "OFF"}

[build-system]
requires = [
"py-build-cmake@git+https://github.com/tttapa/py-build-cmake@7ab73da351c7140f06d727a8705bece4cf544cd9",
"cmake~=3.23"
"py-build-cmake==0.3.1",
"pybind11-stubgen==2.5.1",
"openvino~=2024.5.0.0.dev",
"cmake~=3.23.0"
]
build-backend = "py_build_cmake.build"

3 changes: 2 additions & 1 deletion requirements-build.txt
@@ -1 +1,2 @@
- cmake~=3.30
+ cmake~=3.23.0
+ pybind11-stubgen==2.5.1
15 changes: 15 additions & 0 deletions src/cpp/CMakeLists.txt
@@ -96,6 +96,21 @@ else()
SOVERSION ${MAJOR_SUFFIX}${OpenVINOGenAI_VERSION_MINOR}${OpenVINOGenAI_VERSION_PATCH})
endif()

+ if(OpenVINODeveloperPackage_FOUND)
+ # must be called after all target_link_libraries
+ # ov_add_api_validator_post_build_step(TARGET ${TARGET_NAME})

+ ov_ncc_naming_style(FOR_TARGET ${TARGET_NAME}
+ SOURCE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/include")

+ # TODO: override versions as currently they come from OpenVINO
+ # ov_add_vs_version_file(NAME ${TARGET_NAME}
+ # FILEDESCRIPTION "OpenVINO GenAI library")

+ # TODO: commit changes separately
+ # ov_add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
+ endif()

# - Windows: `<openvino_dir>\runtime\bin\intel64\Release\`
# - MacOS_x86: `<openvino_dir>/runtime/lib/intel64/Release`
# - MacOS_arm64: `<openvino_dir>/runtime/lib/arm64/Release/`
13 changes: 13 additions & 0 deletions src/cpp/include/openvino/genai/visual_language/pipeline.hpp
@@ -99,6 +99,19 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
const StreamerVariant& streamer
);

+ /// @brief Generate a response given a prompt and uint8 RGB image with [NHWC] or [HWC] layout.
+ /// @param prompt A prompt to respond to.
+ /// @param image Image to be prepended to a prompt.
+ /// @param generation_config A config to follow for text generation.
+ /// @param streamer A streamer to acquire intermediate result.
+ /// @return A string generated by a model.
+ DecodedResults generate(
+ const std::string& prompt,
+ const ov::Tensor& rgb,
+ const GenerationConfig& generation_config,
+ const StreamerVariant& streamer
+ );

/// @brief Generate a response given a prompt and config.
/// @param prompt A prompt to respond to.
/// @param config_map A config may contain GenerationConfig, values
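The added overload mirrors the existing map-based one but takes the image tensor, generation config, and streamer positionally. A minimal usage sketch follows (untested; the model directory, device, tensor shape, and the std::monostate "no streamer" value are illustrative assumptions, not part of this diff):

```cpp
#include <iostream>
#include <variant>

#include "openvino/genai/visual_language/pipeline.hpp"

int main() {
    // Model directory and device are placeholder assumptions.
    ov::genai::VLMPipeline pipe("./MiniCPM-V-2_6/", "CPU");

    // A uint8 RGB tensor in [1, H, W, 3] (NHWC) layout; real pixel data
    // would come from an image decoder, which is out of scope here.
    ov::Tensor rgb(ov::element::u8, {1, 448, 448, 3});

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;

    // std::monostate selects the "no streamer" alternative of StreamerVariant.
    ov::genai::DecodedResults result =
        pipe.generate("Can you describe the image?", rgb, config, std::monostate{});
    std::cout << result.texts.at(0) << '\n';
    return 0;
}
```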
20 changes: 14 additions & 6 deletions src/cpp/src/block_manager.hpp
@@ -12,7 +12,6 @@

#include "sequence_group.hpp"

-
namespace ov::genai {

class KVCacheBlock {
@@ -188,7 +187,10 @@
*/
class BlockAllocator {
std::vector<std::list<KVCacheBlock::Ptr>> m_free_blocks;
- int m_total_num_blocks;
+ // We keep m_free_blocks_num instead of m_free_blocks[X].size() to WA old CXX library implementation issue for std::list::size()
+ // see https://stackoverflow.com/questions/13157164/why-isnt-stdlist-size-constant-time
+ std::vector<size_t> m_free_blocks_num;
+ size_t m_total_num_blocks;
friend class CacheStateDumper;
size_t m_num_layers;
bool m_enable_prefix_caching;
@@ -202,8 +204,8 @@
* @param num_layers The number of separate attention layers with KV caches in the LLM associated with the pipeline.
* Blocks returned will be vectors with this size, each vector entry to be associated with a separate layer's KV cache.
*/
- BlockAllocator(int num_blocks, bool enable_prefix_caching, size_t num_layers = 1) :
- m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
+ BlockAllocator(size_t num_blocks, bool enable_prefix_caching, size_t num_layers = 1) :
+ m_free_blocks_num(num_layers, num_blocks), m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero");
m_free_blocks.resize(m_num_layers);
for (auto& per_layer_block_list : m_free_blocks) {
@@ -224,7 +226,7 @@
* @return Number of free blocks for this layer.
*/
size_t num_free_blocks(size_t layer_idx) const {
- return m_free_blocks[layer_idx].size() + m_overwriteable_blocks.num_blocks();
+ return m_free_blocks_num[layer_idx] + num_overwriteable_blocks();
}

/**
Expand Down Expand Up @@ -270,6 +272,7 @@ class BlockAllocator {
block_ptr->release();
if (block_ptr->is_free()) {
m_free_blocks[layer_idx].push_back(block_ptr);
+ ++m_free_blocks_num[layer_idx];
}
}

@@ -325,6 +328,7 @@
// actual collision case
for (size_t layer_idx = 0; layer_idx < colliding_blocks_per_layer.size(); layer_idx++) {
m_free_blocks[layer_idx].push_back(colliding_blocks_per_layer[layer_idx]);
+ ++m_free_blocks_num[layer_idx];
}
}
m_overwriteable_blocks.add(blocks_for_all_layers);
@@ -333,12 +337,14 @@
// TODO (vshampor): more fine-grained hash store control
for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) {
m_free_blocks[layer_idx].push_back(blocks_for_all_layers[layer_idx]);
+ ++m_free_blocks_num[layer_idx];
}
}
}
else {
for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) {
m_free_blocks[layer_idx].push_back(blocks_for_all_layers[layer_idx]);
+ ++m_free_blocks_num[layer_idx];
}
}
}
@@ -368,6 +374,7 @@
KVCacheBlock::Ptr allocated_block = m_free_blocks[layer_idx].front();
allocated_block->increment();
m_free_blocks[layer_idx].pop_front();
+ --m_free_blocks_num[layer_idx];
return allocated_block;
}

@@ -386,7 +393,7 @@
OPENVINO_ASSERT(m_enable_prefix_caching);
OPENVINO_ASSERT(can_allocate_blocks(1));

- if (m_free_blocks[0].size() > 0) {
+ if (m_free_blocks_num[0] > 0) {
// allocate new empty block
BlocksPerLayer allocated_blocks;
allocated_blocks.reserve(m_num_layers);
@@ -396,6 +403,7 @@
allocated_block->set_hash(hash);
allocated_blocks.push_back(allocated_block);
m_free_blocks[i].pop_front();
+ --m_free_blocks_num[i];
}
cached_blocks[hash] = allocated_blocks;
return allocated_blocks;
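The bookkeeping added throughout this file exists because, as the new comment notes, old libstdc++ builds implement std::list::size() as an O(n) traversal. The underlying pattern is a list paired with an explicitly maintained element count; a self-contained sketch of that pattern, independent of this PR (CountedList is an illustrative name, not from the codebase):

```cpp
#include <cstddef>
#include <iostream>
#include <list>

// A std::list plus an explicit element counter. Old libstdc++ implemented
// std::list::size() as an O(n) walk over the nodes, so code that queries
// sizes frequently keeps the count by hand -- the same idea as
// m_free_blocks_num in block_manager.hpp.
template <typename T>
class CountedList {
    std::list<T> m_items;
    std::size_t m_size = 0;  // updated on every insertion and removal

public:
    void push_back(const T& value) {
        m_items.push_back(value);
        ++m_size;
    }

    void pop_front() {
        m_items.pop_front();
        --m_size;
    }

    const T& front() const { return m_items.front(); }

    std::size_t size() const { return m_size; }  // O(1) by construction
};

int main() {
    CountedList<int> blocks;
    blocks.push_back(1);
    blocks.push_back(2);
    blocks.pop_front();
    std::cout << blocks.size() << '\n';  // prints 1
    return 0;
}
```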
22 changes: 2 additions & 20 deletions src/cpp/src/continuous_batching_impl.cpp
@@ -21,7 +21,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(

ov::Core core;

- auto [core_properties, compile_properties] = utils::split_core_complile_config(properties);
+ auto [core_properties, compile_properties] = utils::split_core_compile_config(properties);
core.set_property(core_properties);

// The model can be compiled for GPU as well
@@ -57,7 +57,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
}

SchedulerConfig updated_config = scheduler_config;
- // update KV number in scheduler config
+ // update KV blocks number in scheduler config
if (scheduler_config.num_kv_blocks != device_config.get_num_kv_blocks()) {
updated_config.num_kv_blocks = device_config.get_num_kv_blocks();
}
@@ -166,24 +166,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
timer.start();
logits = m_model_runner->forward(m_requests, scheduler_output);
timer.end();

- ov::InferRequest infer_request = m_model_runner->get_infer_request();
- ov::CompiledModel compiled_model = infer_request.get_compiled_model();
- const bool is_profiling_enabled = compiled_model.get_property(ov::enable_profiling);
-
- // collect detailed statistic
- if (is_profiling_enabled) {
- std::vector<ov::ProfilingInfo> profiling_info = m_model_runner->get_infer_request().get_profiling_info();
- for (const ov::ProfilingInfo& info : profiling_info) {
- double current_time = info.real_time.count();
- if (info.node_type == "PagedAttentionExtension") {
- m_perf.m_paged_attention_time_ms += current_time;
- } else if (info.node_type == "FullyConnected") {
- m_perf.m_matmul_time_ms += current_time;
- }
- m_perf.m_infer_total_ms += current_time;
- }
- }
}

#ifdef DEBUG_CACHE_STATE_DUMP