diff --git a/samples/cpp/speculative_decoding/speculative_decoding_pipeline.hpp b/samples/cpp/speculative_decoding/speculative_decoding_pipeline.hpp index eed4bc5f39..217d0aa826 100644 --- a/samples/cpp/speculative_decoding/speculative_decoding_pipeline.hpp +++ b/samples/cpp/speculative_decoding/speculative_decoding_pipeline.hpp @@ -30,14 +30,15 @@ class SpeculativeDecodingPipeline { assisting_scheduler_config.cache_size = assisted_cache_size; } model_pipeline = ContinuousBatchingPipeline(models_path, model_scheduler_config, device, plugin_config); - // assisting_pipeline = ContinuousBatchingPipeline(assisting_model_path, assisting_scheduler_config, device, plugin_config); - assisting_pipeline = ContinuousBatchingPipeline(models_path, assisting_scheduler_config, device, plugin_config); + assisting_pipeline = ContinuousBatchingPipeline(assisting_model_path, assisting_scheduler_config, device, plugin_config); + // assisting_pipeline = ContinuousBatchingPipeline(models_path, assisting_scheduler_config, device, plugin_config); // assisting_pipeline.set_to_free_sequences(false); m_tokenizer = std::make_shared(models_path); } void step() { ContinuousBatchingPipeline::GeneratedTokensMap candidate_sequences; + std::cout << "K: " << k << std::endl; if (is_speculative_mode) { // generate candidates using small model for (size_t i = 0; i < k; ++i) { diff --git a/src/cpp/continuous_batching/src/block_manager.hpp b/src/cpp/continuous_batching/src/block_manager.hpp index 0d61479609..773763a414 100644 --- a/src/cpp/continuous_batching/src/block_manager.hpp +++ b/src/cpp/continuous_batching/src/block_manager.hpp @@ -240,6 +240,11 @@ class BlockManager { OPENVINO_ASSERT(can_allocate_blocks(num_logical_blocks - num_physical_blocks)); allocate(seq_id, num_logical_blocks - num_physical_blocks); } else { + // TODO(iefode): if the sequence holds more physical than logical blocks, the surplus count is passed to free_sequence_partially and the physical count is re-read from block_table so the equality assert that follows can hold; presumably a stopgap for sequences shortened by speculative decoding, confirm intent and helper semantics + if (num_physical_blocks > num_logical_blocks) { + free_sequence_partially(seq_id, num_physical_blocks - num_logical_blocks); + num_physical_blocks = 
block_table.size(); + } OPENVINO_ASSERT(num_logical_blocks == num_physical_blocks, "A number of physical and logic blocks must be the same in this code path"); KVCacheBlock::Ptr last_block = block_table.back();