Merge pull request #8 from ilya-lavrenov/new-kv-cache-shape-approach
New shape approach
ilya-lavrenov authored May 13, 2024
2 parents 46e2b89 + fb73eb5 commit d2aebd9
Showing 6 changed files with 61 additions and 78 deletions.

@@ -7,24 +7,21 @@
 
 #include "openvino/runtime/tensor.hpp"
 
-#include "model_config.hpp"
 #include "device_config.hpp"
 
 class CacheManager {
-    ModelConfig m_model_config;
     DeviceConfig m_device_config;
     std::vector<ov::Tensor> m_key_cache;
     std::vector<ov::Tensor> m_value_cache;
 
 public:
-    CacheManager(const ModelConfig& model_config, const DeviceConfig& device_config) :
-        m_model_config(model_config),
+    explicit CacheManager(const DeviceConfig& device_config) :
         m_device_config(device_config) {
-        m_key_cache.reserve(m_model_config.get_num_layers());
-        m_value_cache.reserve(m_model_config.get_num_layers());
+        m_key_cache.reserve(m_device_config.get_num_layers());
+        m_value_cache.reserve(m_device_config.get_num_layers());
 
         // Allocate KV caches
-        for (size_t decoder_layer_id = 0; decoder_layer_id < model_config.get_num_layers(); ++decoder_layer_id) {
+        for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
             ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape());
             ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape());
 
@@ -33,10 +30,6 @@ class CacheManager {
         }
     }
 
-    size_t get_num_layers() const {
-        return m_key_cache.size();
-    }
-
     ov::Tensor get_key_cache(size_t decoder_layer_id) const {
         OPENVINO_ASSERT(decoder_layer_id < m_key_cache.size());
         return m_key_cache[decoder_layer_id];
@@ -71,7 +64,7 @@ class CacheManager {
         key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1;
         value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1;
 
-        for (size_t decoder_layer_id = 0; decoder_layer_id < m_model_config.get_num_layers(); ++decoder_layer_id) {
+        for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
            ov::Tensor key_src_cache_roi(m_key_cache[decoder_layer_id], key_src_start_roi, key_src_end_roi);
            ov::Tensor key_dst_cache_roi(m_key_cache[decoder_layer_id], key_dst_start_roi, key_dst_end_roi);
 
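
The net effect in this file: CacheManager no longer stores a ModelConfig; the layer count and cache shapes all come from DeviceConfig. A minimal usage sketch of the new flow, assuming `core` and `scheduler_config` already exist and the headers from this PR are on the include path (the numeric model parameters are purely illustrative):

    DeviceConfig device_config(core, scheduler_config, "CPU");

    // Model parameters must be pushed into DeviceConfig before CacheManager is built,
    // otherwise get_key_cache_shape()/get_value_cache_shape() assert on an empty shape.
    device_config.set_model_params(/* num_kv_heads = */ 32, /* head_size = */ 128, /* num_decoder_layers = */ 24);

    CacheManager cache_manager(device_config);          // allocates 24 key tensors + 24 value tensors
    ov::Tensor layer0_keys = cache_manager.get_key_cache(0);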

@@ -7,8 +7,6 @@
 #include "model_runner.hpp"
 #include "scheduler.hpp"
 #include "timer.hpp"
-#include "model_config.hpp"
-#include "model_config.hpp"
 #include "tokenizer.hpp"
 
 #include "debug_utils.hpp"
@@ -41,8 +39,7 @@ GenerationResult from_sequence_group(std::shared_ptr<Tokenizer> tokenizer, Seque
 
 } // namespace
 
-void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model,
-                                           const ModelConfig& model_config, const DeviceConfig& device_config);
+void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, DeviceConfig& device_config);
 
 class ContinuousBatchingPipeline::Impl {
     std::shared_ptr<Tokenizer> m_tokenizer;
@@ -84,17 +81,16 @@ class ContinuousBatchingPipeline::Impl {
 
         // The model can be compiled for GPU as well
         std::shared_ptr<ov::Model> model = core.read_model(models_path + "/openvino_model.xml");
-        ModelConfig model_config(model);
 
         const std::string device = "CPU";
-        DeviceConfig device_config(core, scheduler_config, model_config, device);
+        DeviceConfig device_config(core, scheduler_config, device);
 
-        apply_paged_attention_transformations(model, model_config, device_config);
+        apply_paged_attention_transformations(model, device_config);
         ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), ov::enable_profiling(true)).create_infer_request();
 
         // setup KV caches
-        m_cache_manager = std::make_shared<CacheManager>(model_config, device_config);
-        for (size_t decoder_layer_id = 0; decoder_layer_id < model_config.get_num_layers(); ++decoder_layer_id) {
+        m_cache_manager = std::make_shared<CacheManager>(device_config);
+        for (size_t decoder_layer_id = 0; decoder_layer_id < device_config.get_num_layers(); ++decoder_layer_id) {
             infer_request.set_input_tensor(2 + decoder_layer_id * 2, m_cache_manager->get_key_cache(decoder_layer_id));
             infer_request.set_input_tensor(2 + decoder_layer_id * 2 + 1, m_cache_manager->get_value_cache(decoder_layer_id));
         }
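
After the rewrite the pipeline never touches ModelConfig: it reads the model, builds a DeviceConfig from the scheduler settings alone, lets apply_paged_attention_transformations() discover the model parameters, and only then allocates caches. The cache wiring relies on a fixed input layout; a hedged restatement of it (the index arithmetic comes from the code above, the local variable names are illustrative):

    // Inputs 0 and 1 are the regular model inputs; KV cache inputs start at offset 2,
    // with each decoder layer contributing an adjacent (key, value) pair.
    for (size_t decoder_layer_id = 0; decoder_layer_id < device_config.get_num_layers(); ++decoder_layer_id) {
        size_t key_input_id = 2 + 2 * decoder_layer_id;    // layer 0 -> 2, layer 1 -> 4, ...
        size_t value_input_id = key_input_id + 1;          // layer 0 -> 3, layer 1 -> 5, ...
        infer_request.set_input_tensor(key_input_id, m_cache_manager->get_key_cache(decoder_layer_id));
        infer_request.set_input_tensor(value_input_id, m_cache_manager->get_value_cache(decoder_layer_id));
    }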

@@ -4,35 +4,47 @@
 #pragma once
 
 #include "openvino/runtime/core.hpp"
+#include "openvino/core/shape.hpp"
+#include "openvino/core/type/element_type.hpp"
 
-#include "model_config.hpp"
 #include "scheduler_config.hpp"
 
 class DeviceConfig {
     ov::element::Type m_kv_cache_type;
-    ov::Shape m_key_cache_shape;
-    ov::Shape m_value_cache_shape;
+    ov::Shape m_key_cache_shape, m_value_cache_shape;
+    ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers;
+    size_t m_num_kv_blocks, m_block_size;
     std::string m_device;
 
 public:
-    DeviceConfig(ov::Core& core, const SchedulerConfig& scheduling_config, const ModelConfig& model_config, const std::string& device) {
+    DeviceConfig(ov::Core& core, const SchedulerConfig& scheduling_config, const std::string& device) {
         m_device = device;
 
+        // keep information about blocks
+        m_num_kv_blocks = scheduling_config.num_kv_blocks;
+        m_block_size = scheduling_config.block_size;
+
         if (m_device == "CPU") {
             auto inference_precision = core.get_property(device, ov::hint::inference_precision);
             m_kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
-            m_key_cache_shape = m_value_cache_shape = ov::Shape{scheduling_config.num_kv_blocks,
-                                                                model_config.get_num_kv_heads(),
-                                                                scheduling_config.block_size,
-                                                                model_config.get_head_size()};
         } else if (m_device == "GPU") {
             OPENVINO_ASSERT(false, "GPU is not currently supported. Please remove this assert and fill the configuration");
         } else {
             OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching");
         }
     }
 
+    void set_model_params(size_t num_kv_heads, size_t head_size, size_t num_decoder_layers) {
+        m_num_kv_heads = num_kv_heads;
+        m_head_size = head_size;
+        m_num_decoder_layers = num_decoder_layers;
+
+        m_key_cache_shape = m_value_cache_shape = ov::Shape{m_num_kv_blocks,
+                                                            m_num_kv_heads,
+                                                            m_block_size,
+                                                            m_head_size};
+    }
+
     std::string get_device() const {
         return m_device;
     }
@@ -41,11 +53,17 @@ class DeviceConfig {
         return m_kv_cache_type;
     }
 
+    size_t get_num_layers() const {
+        return m_num_decoder_layers;
+    }
+
     ov::Shape get_key_cache_shape() const {
+        OPENVINO_ASSERT(!m_key_cache_shape.empty());
         return m_key_cache_shape;
     }
 
     ov::Shape get_value_cache_shape() const {
+        OPENVINO_ASSERT(!m_value_cache_shape.empty());
         return m_value_cache_shape;
     }
 };
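
DeviceConfig now owns the whole KV cache geometry: block count and block size arrive from the SchedulerConfig at construction time, while head count, head size, and layer count are filled in later via set_model_params(). A worked example of the resulting per-layer shape, as a standalone sketch (all numbers hypothetical):

    #include "openvino/core/shape.hpp"
    #include <cassert>

    int main() {
        size_t num_kv_blocks = 1024, block_size = 16;  // from SchedulerConfig (illustrative values)
        size_t num_kv_heads = 32, head_size = 128;     // discovered from the model after the transformation

        // Both the key and the value cache of every decoder layer get this shape:
        ov::Shape kv_cache_shape{num_kv_blocks, num_kv_heads, block_size, head_size};

        // 1024 blocks * 32 heads * 16 tokens per block * 128 dims = 67,108,864 elements per tensor
        assert(ov::shape_size(kv_cache_shape) == 67'108'864);
        return 0;
    }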

This file was deleted.


@@ -6,20 +6,38 @@
 #include "openvino/pass/manager.hpp"
 #include "openvino/pass/sdpa_to_paged_attention.hpp"
 
-#include "model_config.hpp"
 #include "device_config.hpp"
 
-void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, const ModelConfig& model_config, const DeviceConfig& device_config) {
+void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, DeviceConfig& device_config) {
+    const ov::op::util::VariableVector& variables = model->get_variables();
+    OPENVINO_ASSERT(!variables.empty(), "Model is supposed to be stateful");
+
+    // number of variables is 2 (K and V) multiplied by number of decoder layers
+    size_t num_layers = variables.size() >> 1;
+
     ov::pass::Manager manager;
     manager.register_pass<ov::pass::SDPAToPagedAttention>();
     manager.run_passes(model);
 
     const ov::ParameterVector& parameters = model->get_parameters();
-    for (size_t decoder_layer_id = 0; decoder_layer_id < model_config.get_num_layers(); ++decoder_layer_id) {
-        parameters[2 + 2 * decoder_layer_id]->set_element_type(device_config.get_cache_precision());
-        parameters[2 + 2 * decoder_layer_id + 1]->set_element_type(device_config.get_cache_precision());
-        parameters[2 + 2 * decoder_layer_id]->set_partial_shape(device_config.get_key_cache_shape());
-        parameters[2 + 2 * decoder_layer_id + 1]->set_partial_shape(device_config.get_value_cache_shape());
+
+    for (auto param : parameters) {
+        std::cout << param->get_friendly_name() << " " << param->get_partial_shape() << std::endl;
+    }
+
+    // extract num_kv_heads and head_size
+    size_t kv_caches_inputs_offset = 2;
+    ov::PartialShape k_shape = parameters[kv_caches_inputs_offset]->get_partial_shape();
+    OPENVINO_ASSERT(k_shape.rank().get_length() == 3, "KV cache shape is expected to have rank 3, while shape is ", k_shape);
+    size_t num_kv_heads = k_shape[1].get_length(), head_size = k_shape[2].get_length();
+
+    device_config.set_model_params(num_kv_heads, head_size, num_layers);
+
+    for (size_t decoder_layer_id = 0; decoder_layer_id < num_layers; ++decoder_layer_id) {
+        parameters[kv_caches_inputs_offset + 2 * decoder_layer_id]->set_element_type(device_config.get_cache_precision());
+        parameters[kv_caches_inputs_offset + 2 * decoder_layer_id + 1]->set_element_type(device_config.get_cache_precision());
+        parameters[kv_caches_inputs_offset + 2 * decoder_layer_id]->set_partial_shape(device_config.get_key_cache_shape());
+        parameters[kv_caches_inputs_offset + 2 * decoder_layer_id + 1]->set_partial_shape(device_config.get_value_cache_shape());
     }
     model->validate_nodes_and_infer_types();
 }
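
The transformation pass now drives DeviceConfig instead of the other way around: the layer count is inferred from the stateful model's variables (one K and one V variable per decoder layer), and the head geometry is read back from the PagedAttention-transformed parameters. The dimension-extraction step in isolation, as a hedged standalone sketch (the concrete shape is hypothetical; only the rank-3 layout with num_kv_heads and head_size in dimensions 1 and 2 is taken from the code above):

    #include "openvino/core/partial_shape.hpp"
    #include <iostream>

    int main() {
        // Shape a KV cache parameter might carry right after SDPAToPagedAttention
        // (illustrative values; dimension 0 is left dynamic here):
        ov::PartialShape k_shape{ov::Dimension::dynamic(), 32, 128};

        if (k_shape.rank().get_length() == 3) {
            size_t num_kv_heads = k_shape[1].get_length();  // -> 32
            size_t head_size = k_shape[2].get_length();     // -> 128
            std::cout << "num_kv_heads=" << num_kv_heads << ", head_size=" << head_size << std::endl;
        }
        return 0;
    }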

@@ -7,7 +7,6 @@
 #include <cstdlib>
 #include <vector>
 
-#include "model_config.hpp"
 #include "block_manager.hpp"
 #include "sequence_group.hpp"
 #include "block_manager.hpp"
