From 870bb7979169fc5d476b7397429a716ccefcada7 Mon Sep 17 00:00:00 2001
From: Eugene Smirnov
Date: Fri, 29 Nov 2024 16:53:32 +0000
Subject: [PATCH] NPUW: Unfold infer requests (#27319) (#27825)

Cherry-picked PR: https://github.com/openvinotoolkit/openvino/pull/27319

Co-authored-by: Dmitry Matveev
---
 .../src/al/include/intel_npu/config/npuw.hpp  |   1 +
 .../intel_npu/npuw_private_properties.hpp     |   8 +
 .../intel_npu/src/al/src/config/npuw.cpp      |   1 +
 .../plugin/npuw/base_sync_infer_request.cpp   | 291 ++++++++++++++++-
 .../plugin/npuw/base_sync_infer_request.hpp   |  37 ++-
 .../src/plugin/npuw/compiled_model.cpp        |  97 ++++--
 .../src/plugin/npuw/compiled_model.hpp        |  14 +-
 .../plugin/npuw/just_sync_infer_request.cpp   | 308 ++----------------
 .../plugin/npuw/just_sync_infer_request.hpp   |  34 +-
 .../plugin/npuw/unfold_sync_infer_request.cpp | 140 ++++++++
 .../plugin/npuw/unfold_sync_infer_request.hpp |  42 +++
 11 files changed, 621 insertions(+), 352 deletions(-)
 create mode 100644 src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
 create mode 100644 src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp

diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
index bdda589a7bcb7b..6d865ad5e4edf3 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -57,6 +57,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
 DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
 DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime);
 DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
+DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime);
 DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
 DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
 DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime);
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
index 648bcde0cdc913..af4a17988f451e 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -287,6 +287,14 @@ static constexpr ov::Property<bool> parallel_compilation{"NPUW_PARALLEL_COMPILE"
  */
 static constexpr ov::Property<bool> funcall_async{"NPUW_FUNCALL_ASYNC"};
+/**
+ * @brief
+ * Type: boolean
+ * Create individual infer requests for partitions, even repeating ones.
+ * Default value: false.
+ */ +static constexpr ov::Property unfold_ireqs{"NPUW_UNFOLD_IREQS"}; + namespace accuracy { /** * @brief diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 84ac94d1f7c67b..0c7978845c690c 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -39,6 +39,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp index 216b1a35b4315c..77d000cb415de7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp @@ -7,6 +7,7 @@ #include "compiled_model.hpp" #include "intel_npu/config/npuw.hpp" #include "logging.hpp" +#include "openvino/core/parallel.hpp" #include "util.hpp" ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr& compiled_model) @@ -58,12 +59,8 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re LOG_INFO("- Trying next device..."); comp_model_desc.device_it++; can_try_again = m_npuw_model->compile_for_success(id); - if (can_try_again) { - if (recompiled) - *recompiled = true; - // Probably shouldn't be called all the time, but only if - // I/O submodel is affected - m_npuw_model->reset_io(); + if (can_try_again && recompiled) { + *recompiled = true; } } } // while(!new_ireq && can_try_again) @@ -178,6 +175,33 @@ void ov::npuw::IBaseInferRequest::check_tensors() const { return; } +std::vector> ov::npuw::IBaseInferRequest::query_state() const { + std::vector> variable_states = {}; + for (const auto& request : m_subrequests) { + if (!request) // optimized out + continue; + for (auto&& state : request->query_state()) { + if (!state._so) + state._so = request._so; + variable_states.emplace_back(state); + } + } + return variable_states; +} + +std::vector ov::npuw::IBaseInferRequest::get_profiling_info() const { + std::vector info; + for (size_t i = 0; i < m_subrequests.size(); ++i) { + if (!m_subrequests[i]) // optimized out + continue; + auto&& subreq_info = m_subrequests[i]->get_profiling_info(); + for (auto&& rec : subreq_info) + rec.node_name = std::string("subgraph") + std::to_string(i) + ": " + rec.node_name; + info.insert(info.end(), subreq_info.begin(), subreq_info.end()); + } + return info; +} + void ov::npuw::IBaseInferRequest::infer() { m_now_idx.reset(); prepare_for_infer(); @@ -209,6 +233,261 @@ void ov::npuw::IBaseInferRequest::infer() { m_now_idx.reset(); } +std::size_t ov::npuw::IBaseInferRequest::total_subrequests() const { + return m_subrequests.size(); +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocMem(const ov::element::Type type, + const ov::Shape& shape, + const std::string& device) { + if (device == "CPU" || ov::shape_size(shape) == 0) { + return ov::get_tensor_impl(ov::Tensor(type, shape)); + } + + auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; + auto remote_tensor = remote_ctx->create_host_tensor(type, shape); + return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocOut(const ov::Output& node, + const std::string& device) { + return allocMem(node.get_element_type(), node.get_shape(), device); +} + +void ov::npuw::IBaseInferRequest::alloc_io() { 
+ // Preallocate input tensors + LOG_INFO("Preallocating input tensors..."); + for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { + const auto& port = m_npuw_model->inputs()[i]; + ov::SoPtr allocated = allocOut(port, m_npuw_model->global_mem_device()); + m_input_tensors.push_back(allocated); + m_input_allocated.insert(allocated->data()); + m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true}; + } // for(inputs) + + // Preallocate output tensors + LOG_INFO("Preallocating output tensors..."); + for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { + LOG_BLOCK(); + const auto& port = m_npuw_model->outputs()[i]; + LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port); + + // FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom + const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i); + LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second); + + auto tensor = alloc_global_out(i); + m_output_tensors.push_back(tensor); + m_port_to_tensor[port] = TensorStorage{tensor, true}; + } +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) { + const auto& port = m_npuw_model->outputs().at(out_idx); + return allocOut(port, m_npuw_model->global_mem_device()); +} + +void ov::npuw::IBaseInferRequest::init_gio() { + // Build the parameter/result mapping + m_subrequests_gio.resize(m_subrequests.size()); + + // Parameters: stage 1... + for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { + const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i); + if (to_submodel != CompiledModel::NO_LINK) { + std::size_t sub_idx{}, in_idx{}; + std::tie(sub_idx, in_idx) = to_submodel; + m_subrequests_gio.at(sub_idx).global_params[i] = in_idx; + } + } // for(inputs) + + // Parameters: stage 2... + for (auto&& it : m_npuw_model->m_param_subscribers) { + const auto param_idx = it.first; + for (auto&& to_submodel : it.second) { + std::size_t sub_idx{}, in_idx{}; + std::tie(sub_idx, in_idx) = to_submodel; + m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx; + } + } + + // Results + for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { + std::size_t sub_idx{}, out_idx{}; + std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i); + m_subrequests_gio.at(sub_idx).global_results[i] = out_idx; + } +} + +void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request) { + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + + NPUW_ASSERT(comp_model_desc.replaced_by); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx]; + + // Bind extra parameters from the function's closure + // First, do easy things & delay heavy stuff + std::vector closure_unpack_required; + std::vector closure_copy_required; + + for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { + auto& closure = comp_model_desc.closure[cidx]; + const auto closure_param_id = comp_model_desc.param_base + cidx; + + if (m_npuw_model->is_gather_closure(idx, cidx)) { + // No need to set/copy the host_gather's closure tensor int + // the subrequest - it is just a dummy. host_gather writes + // to the right buffer directly. 
+ continue; + } + + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + if (m_npuw_model->unpack_required(idx, cidx)) { + // Remember where the unpack is required + closure_unpack_required.push_back(cidx); + } else { + if (needs_copy(idx, cidx)) { + // Remember where copy is requried + closure_copy_required.push_back(cidx); + } else { + // Easy case, just set one to another + request->set_tensor(iport, ov::get_tensor_impl(closure)); + } + } + } // for(closure) + + // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){ + ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) { + auto cidx = closure_copy_required[j]; + auto& closure = comp_model_desc.closure[cidx]; + const auto closure_param_id = comp_model_desc.param_base + cidx; + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + auto clparam = request->get_tensor(iport); + ov::get_tensor_impl(closure)->copy_to(clparam._ptr); + }); + // }); // ms_to_run + + for (std::size_t j = 0; j != closure_unpack_required.size(); j++) { + // NB: No need to protect anything here as containers are all + // preallocated and we only access elements under particular (thread + // -local) indices. + auto cidx = closure_unpack_required[j]; + + // FIXME: zerops are stored with absolute indexing, this needs to be aligned + auto& closure = comp_model_desc.closure[cidx]; + + const auto closure_param_id = comp_model_desc.param_base + cidx; + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + auto clparam = request->get_tensor(iport); + + if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) { + // Unpacking this weight requires scaling with zero points... + ov::npuw::util::unpack(ov::get_tensor_impl(closure), + ov::get_tensor_impl(comp_model_desc.zerops[cidx]), + ov::get_tensor_impl(comp_model_desc.scales[cidx]), + clparam); + } else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) { + // Unpacking this weight requires scaling + ov::npuw::util::unpack(ov::get_tensor_impl(closure), + ov::get_tensor_impl(comp_model_desc.scales[cidx]), + clparam); + } else { + // Unpacking this weight doesn't require scaling + ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam); + } + } +} + +void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr request) { + LOG_DEBUG("Binding parameters for Subgraph[" << idx << "]"); + LOG_BLOCK(); + + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + const auto real_idx = comp_model_desc.replaced_by.value_or(idx); + + const bool do_copy = needs_copy(idx); + const auto& iodesc = m_subrequests_gio.at(idx); + + const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const bool is_spatial = proto_comp_model_desc.spatial.has_value(); + + // a list of ports to copy tensors, if needed: FROM -> TO + std::vector, ov::Output>> copy_list; + + // Check if the given subgraph's input is spatial + auto is_spatial_param = [&](std::size_t sub_in_idx) -> bool { + if (!is_spatial) { + return false; // Early return + } + auto& spatial = proto_comp_model_desc.spatial.value(); + return std::any_of(spatial.params.begin(), spatial.params.end(), [&](const auto& p) -> bool { + return p.idx == sub_in_idx; + }); + }; + + for (auto&& it : iodesc.global_params) { + std::size_t param_idx{}, sub_in_idx{}; + std::tie(param_idx, sub_in_idx) = it; + LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl); + + const auto& g_port = 
m_npuw_model->inputs()[param_idx]; + const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor; + const auto& s_port = request->get_inputs()[sub_in_idx]; + LOG_DEBUG("Processing " << g_port << " -> " << s_port << "..."); + LOG_BLOCK(); + if (!is_spatial_param(sub_in_idx)) { + // Input parameter is non-spatial, do normal handling + if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) { + LOG_DEBUG("Will be copied"); + copy_list.emplace_back(g_tnsr, s_port); + } else { + LOG_DEBUG("Will be set"); + request->set_tensor(s_port, g_tnsr); + } + } else { + // Register for future use + m_spatial_io[real_idx].inputs.at(sub_in_idx) = g_tnsr; + } + } + + LOG_DEBUG("Running copy..."); + ov::parallel_for(copy_list.size(), [&](std::size_t idx) { + auto& it = copy_list[idx]; + ov::SoPtr dst = request->get_tensor(it.second); + it.first->copy_to(dst._ptr); + }); + + // Run host-side gather, if required + if (comp_model_desc.host_gather.dst_idx != -1) { + const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx]; + const auto gather = request->get_tensor(gport); + + const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base]; + const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx]; + const auto lookup = request->get_tensor(lport); + ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather); + } + + LOG_DEBUG("Done"); +} + +void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr request) { + LOG_DEBUG("Binding results for Subgraph[" << idx << "]"); + LOG_BLOCK(); + + const auto& iodesc = m_subrequests_gio.at(idx); + for (auto&& it : iodesc.global_results) { + std::size_t result_idx{}, sub_out_idx{}; + std::tie(result_idx, sub_out_idx) = it; + const auto& g_port = m_npuw_model->outputs()[result_idx]; + const auto& s_port = request->get_outputs()[sub_out_idx]; + request->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor); + } + + LOG_DEBUG("Done"); +} + void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp index 6be64d676d6149..ae24dcfee11f9d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp @@ -19,8 +19,15 @@ namespace ov { namespace npuw { +using TensorPtr = ov::SoPtr; + class CompiledModel; +using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure + // This interface is provided to npuw::AsyncInferRequest to manage the // individual subrequests' execution class IBaseInferRequest : public ov::ISyncInferRequest { @@ -40,6 +47,10 @@ class IBaseInferRequest : public ov::ISyncInferRequest { void check_tensors() const override; + // Query APIs - some default implementations here + std::vector> query_state() const override; + std::vector get_profiling_info() const override; + using sptr = std::shared_ptr; using Completed = std::function; @@ -50,7 +61,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest { virtual void run_subrequest_for_success(std::size_t idx, bool& failover) = 0; virtual void complete_subrequest(std::size_t idx) = 0; virtual void cancel_subrequest(std::size_t 
idx) = 0; - virtual std::size_t total_subrequests() const = 0; + virtual std::size_t total_subrequests() const; virtual bool supports_async_pipeline() const = 0; protected: @@ -107,8 +118,32 @@ class IBaseInferRequest : public ov::ISyncInferRequest { }; std::vector m_spatial_io; + // This structure tracks how every individual subrequest + // access the model's top-level (global, public, etc) parameters + // and results. Again, is managed by subclasses + struct GlobalIO { + using map_t = std::map; + map_t global_params; // param idx -> input idx + map_t global_results; // result idx -> output idx + }; + std::vector m_subrequests_gio; + + // Tracks tensors we allocated on our own - to recognize and avoid copies + std::unordered_set m_input_allocated; + + // Common functionality - shared for subclasses const std::size_t m_num_submodels; + TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device); + TensorPtr allocOut(const ov::Output& node, const std::string& device); + virtual void alloc_io(); + virtual TensorPtr alloc_global_out(std::size_t out_idx); + + virtual void init_gio(); + void unpack_closure(std::size_t idx, RqPtr request); + virtual void bind_global_params(std::size_t idx, RqPtr request); + virtual void bind_global_results(std::size_t idx, RqPtr request); + void dump_input_tensors(std::size_t idx); void dump_output_tensors(std::size_t idx); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 4110307ec1623e..c6be2793fe6f70 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -21,6 +21,7 @@ #include "openvino/util/common_util.hpp" #include "partitioning/patterns/opt.hpp" #include "plugin.hpp" +#include "unfold_sync_infer_request.hpp" #include "util.hpp" // required for get_properties_per_device() @@ -442,9 +443,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } implement_properties(); - - m_finalized = true; - reset_io(); + report_io(); } void ov::npuw::CompiledModel::finalize_weights_bank() { @@ -570,19 +569,7 @@ void ov::npuw::CompiledModel::fill_empty_tensor_names(const std::shared_ptr ov::npuw::CompiledModel::create_just_sync_infer_request() { - auto this_sptr = std::static_pointer_cast(shared_from_this()); - return std::make_shared(this_sptr); -} - std::shared_ptr ov::npuw::CompiledModel::create_sync_infer_request() const { // Synchronous infer request implementation may vary based on the // selected strategy auto* non_const_this = const_cast(this); // because of const in API - return non_const_this->create_just_sync_infer_request(); + auto non_const_this_sptr = std::static_pointer_cast(non_const_this->shared_from_this()); + + auto no_spatial_unpack = [&]() { + const auto num_submodels = m_compiled_submodels.size(); + for (std::size_t idx = 0u; idx < num_submodels; idx++) { + const auto& comp_model_desc = m_compiled_submodels[idx]; + if (!comp_model_desc.replaced_by.has_value()) { + // not a funcall, do nothing + continue; + } + const auto real_idx = comp_model_desc.replaced_by.value(); + if (m_compiled_submodels[real_idx].spatial) { + LOG_WARN("Subgraph[" << idx << "] is a call to spatial function, unfold can't be done"); + return false; // Spatial graph + } + if (unpack_required(idx)) { + LOG_WARN("Subgraph[" << idx << "] requires unpack, unfold can't be done"); + return false; // Unpack required + } + } + return true; // no spatial & subgraphs 
requiring unpack found + }; + + std::shared_ptr result; + if (m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>() && no_spatial_unpack()) { + result.reset(new ov::npuw::UnfoldInferRequest(non_const_this_sptr)); + } else { + result.reset(new ov::npuw::JustInferRequest(non_const_this_sptr)); + } + NPUW_ASSERT(result); + return result; } std::shared_ptr ov::npuw::CompiledModel::create_infer_request() const { @@ -776,6 +788,46 @@ std::string ov::npuw::CompiledModel::submodel_device(const std::size_t idx) cons return *comp_subm_desc.device_it; } +bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx) const { + auto& comp_model_desc = m_compiled_submodels.at(idx); + for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { + if (unpack_required(idx, cidx)) { + return true; + } + } + return false; +} + +bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx, const std::size_t cidx) const { + if (is_gather_closure(idx, cidx)) { + return false; + } + + auto& comp_model_desc = m_compiled_submodels.at(idx); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_compiled_submodels.at(real_idx); + + auto& closure = comp_model_desc.closure.at(cidx); + const auto closure_param_id = comp_model_desc.param_base + cidx; + + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + return (closure.get_element_type() != iport.get_element_type()); +} + +bool ov::npuw::CompiledModel::is_gather_closure(const std::size_t idx, const std::size_t cidx) const { + auto& comp_model_desc = m_compiled_submodels.at(idx); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_compiled_submodels.at(real_idx); + + const auto closure_param_id = comp_model_desc.param_base + cidx; + + if (func_desc.host_gather.dst_idx != -1 && + static_cast(func_desc.host_gather.dst_idx) == closure_param_id) { + return true; + } + return false; +} + void ov::npuw::CompiledModel::log_device_dist() const { std::unordered_map stats_for_devices; execution_stats stats_for_optimized_out{0.f, 0ul}; @@ -935,6 +987,7 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE), BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE), BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC), + BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS), BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK), BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC), BIND(npuw::cache_dir, NPUW_CACHE_DIR), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 6199ac66c0c64e..ece1bc78fb5bf5 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -47,6 +47,7 @@ class CompiledModel : public ov::ICompiledModel { // FIXME: This class has many friends.. 
friend class IBaseInferRequest; friend class JustInferRequest; + friend class UnfoldInferRequest; friend class MemAccessSim; friend class FuncMemMgr; @@ -57,28 +58,27 @@ class CompiledModel : public ov::ICompiledModel { void dump_on_fail(std::size_t id, const std::string& device_to_stry, const char* extra); - bool m_finalized = false; - void reset_io(); + void report_io() const; // This is used for removing too long output tensor names to fix some compilation issues + // NB: These two methods has nothing to do with this particular class and should be + // moved elsewhere void remove_long_output_names(const std::shared_ptr& model); void fill_empty_tensor_names(const std::shared_ptr& model); std::shared_ptr get_npuw_plugin() const; - - std::shared_ptr create_just_sync_infer_request(); std::shared_ptr create_sync_infer_request() const override; std::string submodel_device(const std::size_t idx) const; + bool is_gather_closure(const std::size_t idx, const std::size_t cidx) const; + bool unpack_required(const std::size_t idx) const; + bool unpack_required(const std::size_t idx, const std::size_t cidx) const; void log_device_dist() const; - void implement_properties(); void finalize_weights_bank(); - std::string global_mem_device() const; - std::string funcall_mem_device(const std::size_t idx) const; std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 0e0b96582a663c..8d1c7c4a30acde 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -310,69 +310,9 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrinputs().size(); i++) { - const auto& port = m_npuw_model->inputs()[i]; - ov::SoPtr allocated = allocOut(port, m_npuw_model->global_mem_device()); - m_input_tensors.push_back(allocated); - m_input_allocated.insert(allocated->data()); - m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true}; - } // for(inputs) - - // Preallocate output tensors - LOG_INFO("Preallocating output tensors..."); - for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { - LOG_BLOCK(); - const auto& port = m_npuw_model->outputs()[i]; - LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port); - - // FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom - const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i); - - LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second); - auto funcall_result_iter = m_funcall_result.find(from_submodel); - - const auto& tensor = - funcall_result_iter != m_funcall_result.end() - ? funcall_result_iter->second // Function calls have their tensors allocated, so just use one - : allocOut(port, m_npuw_model->global_mem_device()); - - m_output_tensors.push_back(tensor); - m_port_to_tensor[port] = TensorStorage{tensor, true}; - } + alloc_io(); connect_subrequests(); - - // Build the parameter/result mapping {{{ - m_subrequests_gio.resize(m_subrequests.size()); - - // Parameters: stage 1... 
- for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { - const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i); - if (to_submodel != CompiledModel::NO_LINK) { - std::size_t sub_idx{}, in_idx{}; - std::tie(sub_idx, in_idx) = to_submodel; - m_subrequests_gio.at(sub_idx).global_params[i] = in_idx; - } - } // for(inputs) - - // Parameters: stage 2... - for (auto&& it : m_npuw_model->m_param_subscribers) { - const auto param_idx = it.first; - for (auto&& to_submodel : it.second) { - std::size_t sub_idx{}, in_idx{}; - std::tie(sub_idx, in_idx) = to_submodel; - m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx; - } - } - - // Results - for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { - std::size_t sub_idx{}, out_idx{}; - std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i); - m_subrequests_gio.at(sub_idx).global_results[i] = out_idx; - } - // }}} + init_gio(); for (size_t i = 0; i < m_num_submodels; i++) { LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]..."); @@ -413,6 +353,15 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_outputs_to_submodels_outputs.at(out_idx); + auto funcall_result_iter = m_funcall_result.find(from_submodel); + if (funcall_result_iter != m_funcall_result.end()) { + return funcall_result_iter->second; + } + return IBaseInferRequest::alloc_global_out(out_idx); +} + void ov::npuw::JustInferRequest::connect_subrequests() { LOG_INFO("Connecting subrequests..."); LOG_BLOCK(); @@ -478,33 +427,6 @@ void ov::npuw::JustInferRequest::connect_subrequests() { LOG_INFO("Done"); } -std::vector> ov::npuw::JustInferRequest::query_state() const { - std::vector> variable_states = {}; - for (const auto& request : m_subrequests) { - if (!request) // optimized out - continue; - for (auto&& state : request->query_state()) { - if (!state._so) - state._so = request._so; - variable_states.emplace_back(state); - } - } - return variable_states; -} - -std::vector ov::npuw::JustInferRequest::get_profiling_info() const { - std::vector info; - for (size_t i = 0; i < m_subrequests.size(); ++i) { - if (!m_subrequests[i]) // optimized out - continue; - auto&& subreq_info = m_subrequests[i]->get_profiling_info(); - for (auto&& rec : subreq_info) - rec.node_name = std::string("subgraph") + std::to_string(i) + ": " + rec.node_name; - info.insert(info.end(), subreq_info.begin(), subreq_info.end()); - } - return info; -} - void ov::npuw::JustInferRequest::prepare_for_infer() { LOG_DEBUG("Preparing to infer..."); LOG_BLOCK(); @@ -542,118 +464,36 @@ void ov::npuw::JustInferRequest::start_subrequest(std::size_t idx) { } void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) { - LOG_DEBUG("Binding parameters for Subgraph[" << idx << "]"); - LOG_BLOCK(); - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; const auto real_idx = comp_model_desc.replaced_by.value_or(idx); - const bool do_copy = needs_copy(idx); - const auto& iodesc = m_subrequests_gio.at(idx); - - const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - const bool is_spatial = proto_comp_model_desc.spatial.has_value(); - - // a list of ports to copy tensors, if needed: FROM -> TO - std::vector, ov::Output>> copy_list; - // pick which subrequest we actually work on here - auto subr = [&]() { - if (now_idx() && real_idx == real(now_idx().value()) && is_pipelined(now_idx().value())) { - LOG_DEBUG("Accessing the pipeline subrequest"); - // The real index of request we need 
to prepare IS - // the same request which executes now AND - // function_pipelining enabled - select the reserve request. - NPUW_ASSERT(m_funcall_pipeline[real_idx].subrequest); - return m_funcall_pipeline[real_idx].subrequest; - } + if (now_idx() && real_idx == real(now_idx().value()) && is_pipelined(now_idx().value())) { + LOG_DEBUG("Accessing the pipeline subrequest"); + // The real index of request we need to prepare IS + // the same request which executes now AND + // function_pipelining enabled - select the reserve request. + NPUW_ASSERT(m_funcall_pipeline[real_idx].subrequest); + bind_global_params(idx, m_funcall_pipeline[real_idx].subrequest); + } else { // Otherwise: Just a return a subrequest which is in place. // If it is a function call and we have function pipelining ON, // it is still the right subrequest we can use. LOG_DEBUG("Accessing the primary subrequest"); - return m_subrequests[real_idx]; - }(); - - // Check if the given subgraph's input is spatial - auto is_spatial_param = [&](std::size_t sub_in_idx) -> bool { - if (!is_spatial) { - return false; // Early return - } - auto& spatial = proto_comp_model_desc.spatial.value(); - return std::any_of(spatial.params.begin(), spatial.params.end(), [&](const auto& p) -> bool { - return p.idx == sub_in_idx; - }); - }; - - for (auto&& it : iodesc.global_params) { - std::size_t param_idx{}, sub_in_idx{}; - std::tie(param_idx, sub_in_idx) = it; - LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl); - - const auto& g_port = m_npuw_model->inputs()[param_idx]; - const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor; - const auto& s_port = subr->get_inputs()[sub_in_idx]; - LOG_DEBUG("Processing " << g_port << " -> " << s_port << "..."); - LOG_BLOCK(); - if (!is_spatial_param(sub_in_idx)) { - // Input parameter is non-spatial, do normal handling - if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) { - LOG_DEBUG("Will be copied"); - copy_list.emplace_back(g_tnsr, s_port); - } else { - LOG_DEBUG("Will be set"); - subr->set_tensor(s_port, g_tnsr); - } - } else { - // Register for future use - m_spatial_io[real_idx].inputs.at(sub_in_idx) = g_tnsr; - } + bind_global_params(idx, m_subrequests[real_idx]); } - - LOG_DEBUG("Running copy..."); - ov::parallel_for(copy_list.size(), [&](std::size_t idx) { - auto& it = copy_list[idx]; - ov::SoPtr dst = subr->get_tensor(it.second); - it.first->copy_to(dst._ptr); - }); - - // Run host-side gather, if required - if (comp_model_desc.host_gather.dst_idx != -1) { - const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx]; - const auto gather = subr->get_tensor(gport); - - const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base]; - const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx]; - const auto lookup = subr->get_tensor(lport); - ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather); - } - - LOG_DEBUG("Done"); } void ov::npuw::JustInferRequest::bind_global_results(std::size_t idx) { - LOG_DEBUG("Binding results for Subgraph[" << idx << "]"); - LOG_BLOCK(); - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; if (comp_model_desc.replaced_by) { // Don't do here - function call will take the right tensor // itself. 
Note it may be implemented more efficently than now // (and in some cases, the tensor can be pre-set) - LOG_DEBUG("Skipping this too now - function will do it for itself"); + LOG_DEBUG("Skipping bind_glo - function will do it for itself"); return; } - - const auto& iodesc = m_subrequests_gio.at(idx); - for (auto&& it : iodesc.global_results) { - std::size_t result_idx{}, sub_out_idx{}; - std::tie(result_idx, sub_out_idx) = it; - const auto& g_port = m_npuw_model->outputs()[result_idx]; - const auto& s_port = m_subrequests[idx]->get_outputs()[sub_out_idx]; - m_subrequests[idx]->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor); - } - - LOG_DEBUG("Done"); + IBaseInferRequest::bind_global_results(idx, m_subrequests[idx]); } void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { @@ -737,88 +577,6 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { LOG_DEBUG("Done"); } -void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - - NPUW_ASSERT(comp_model_desc.replaced_by); - const auto real_idx = comp_model_desc.replaced_by.value(); - auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx]; - - // Bind extra parameters from the function's closure - // First, do easy things & delay heavy stuff - std::vector closure_unpack_required; - std::vector closure_copy_required; - - for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { - auto& closure = comp_model_desc.closure[cidx]; - - const auto closure_param_id = comp_model_desc.param_base + cidx; - - if (func_desc.host_gather.dst_idx != -1 && - static_cast(func_desc.host_gather.dst_idx) == closure_param_id) { - // No need to set/copy the host_gather's closure tensor int - // the subrequest - it is just a dummy. host_gather writes - // to the right buffer directly. - continue; - } - - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - if (closure.get_element_type() != iport.get_element_type()) { - // Remember where the unpack is required - closure_unpack_required.push_back(cidx); - } else { - if (needs_copy(idx, cidx)) { - // Remember where copy is requried - closure_copy_required.push_back(cidx); - } else { - // Easy case, just set one to another - request->set_tensor(iport, ov::get_tensor_impl(closure)); - } - } - } // for(closure) - - // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){ - ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) { - auto cidx = closure_copy_required[j]; - auto& closure = comp_model_desc.closure[cidx]; - const auto closure_param_id = comp_model_desc.param_base + cidx; - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - auto clparam = request->get_tensor(iport); - ov::get_tensor_impl(closure)->copy_to(clparam._ptr); - }); - // }); // ms_to_run - - for (std::size_t j = 0; j != closure_unpack_required.size(); j++) { - // NB: No need to protect anything here as containers are all - // preallocated and we only access elements under particular (thread - // -local) indices. 
- auto cidx = closure_unpack_required[j]; - - // FIXME: zerops are stored with absolute indexing, this needs to be aligned - auto& closure = comp_model_desc.closure[cidx]; - - const auto closure_param_id = comp_model_desc.param_base + cidx; - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - auto clparam = request->get_tensor(iport); - - if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) { - // Unpacking this weight requires scaling with zero points... - ov::npuw::util::unpack(ov::get_tensor_impl(closure), - ov::get_tensor_impl(comp_model_desc.zerops[cidx]), - ov::get_tensor_impl(comp_model_desc.scales[cidx]), - clparam); - } else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) { - // Unpacking this weight requires scaling - ov::npuw::util::unpack(ov::get_tensor_impl(closure), - ov::get_tensor_impl(comp_model_desc.scales[cidx]), - clparam); - } else { - // Unpacking this weight doesn't require scaling - ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam); - } - } -} - void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) { auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; auto real_idx = comp_model_desc.replaced_by.value_or(idx); @@ -1110,24 +868,6 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool } // if (replaced_by) } -ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type, - const ov::Shape& shape, - const std::string& device) { - if (device == "CPU" || ov::shape_size(shape) == 0) { - return ov::get_tensor_impl(ov::Tensor(type, shape)); - } - - // Protect access to shared context(s) - at least among infer requests - auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; - auto remote_tensor = remote_ctx->create_host_tensor(type, shape); - return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); -} - -ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output& node, - const std::string& device) { - return allocMem(node.get_element_type(), node.get_shape(), device); -} - void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) { get_real_subrequest(idx)->set_callback(std::move(cb)); } @@ -1140,10 +880,6 @@ void ov::npuw::JustInferRequest::cancel_subrequest(std::size_t idx) { m_subrequests[idx]->cancel(); } -std::size_t ov::npuw::JustInferRequest::total_subrequests() const { - return m_subrequests.size(); -} - bool ov::npuw::JustInferRequest::supports_async_pipeline() const { return false; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index d219f170a8e6bb..a935220b4b8943 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -23,13 +23,6 @@ namespace npuw { class CompiledModel; class AsyncInferRequest; -using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure - -using TensorPtr = ov::SoPtr; - class MemAccessSim { public: explicit MemAccessSim(const std::shared_ptr& compiled_model); @@ -77,11 +70,7 @@ class JustInferRequest final : public IBaseInferRequest { public: explicit JustInferRequest(const std::shared_ptr& compiled_model); - // Query APIs - std::vector> query_state() const override; - std::vector get_profiling_info() const override; - -private: +protected: 
//////////////////////////////////// // implement IBaseInferRequest void prepare_for_infer() override; @@ -91,11 +80,11 @@ class JustInferRequest final : public IBaseInferRequest { void subscribe_subrequest(std::size_t idx, Completed cb) override; void complete_subrequest(std::size_t idx) override; void cancel_subrequest(std::size_t idx) override; - std::size_t total_subrequests() const override; bool supports_async_pipeline() const override; - void update_subrequest_links(std::size_t idx) override; + TensorPtr alloc_global_out(std::size_t out_idx) override; + //////////////////////////////////// // now own API @@ -104,9 +93,9 @@ class JustInferRequest final : public IBaseInferRequest { void bind_global_parameters(std::size_t idx); void bind_global_results(std::size_t idx); + using IBaseInferRequest::bind_global_results; void function_prologue(std::size_t idx); - void unpack_closure(std::size_t idx, RqPtr request); void unsafe_during(std::size_t real_idx, const std::function& f); void unsafe_infer(std::size_t real_idx); @@ -115,9 +104,6 @@ class JustInferRequest final : public IBaseInferRequest { void connect_subrequests(); void recreate_subrequests(std::size_t idx); - TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device); - TensorPtr allocOut(const ov::Output& node, const std::string& device); - FuncMemMgr m_func_mem_mgr; // Owns memory std::map m_funcall_result; // Provides a convenient link @@ -139,18 +125,6 @@ class JustInferRequest final : public IBaseInferRequest { // initialized. std::vector m_funcall_pipeline; - // This structure tracks how every individual subrequest - // access the model's top-level (global, public, etc) parameters - // and results - struct GlobalIO { - using map_t = std::map; - map_t global_params; // param idx -> input idx - map_t global_results; // result idx -> output idx - }; - std::vector m_subrequests_gio; - - std::unordered_set m_input_allocated; - // Represents spatial run-time info runtime::spatial::Selector::Ptr m_spatial_selector; diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp new file mode 100644 index 00000000000000..90eb62dcc0a8e3 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "unfold_sync_infer_request.hpp" + +#include "compiled_model.hpp" +#include "logging.hpp" +#include "openvino/core/parallel.hpp" + +ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr& compiled_model) + : ov::npuw::IBaseInferRequest(compiled_model) { + // Create infer requests + // Preallocate funcall tensors & substitute function call requests + for (std::size_t i = 0; i < m_num_submodels; i++) { + LOG_INFO("Creating infer request for Subgraph[" << i << "]..."); + LOG_BLOCK(); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i]; + + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + // no model & no funcall - optimized out, do nothing + LOG_INFO("OPTIMIZED OUT"); + continue; + } + + if (comp_model_desc.replaced_by) { + // Pre-allocate output tensors for this function call + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (proto_comp_model_desc.spatial) { + NPUW_ASSERT(false && "Spatial is not supported in unfold"); + } + } // 
if(replaced_by) + + const auto real_idx = comp_model_desc.replaced_by.value_or(i); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + // NB: UnfoldInferRequest is _NOT_ fail-safe! Fail means fail here + m_subrequests[i] = proto_comp_model_desc.compiled_model->create_infer_request(); + m_subrequest_devices[i] = *proto_comp_model_desc.device_it; + LOG_INFO("DONE"); + } // for(submodels) + + alloc_io(); + + LOG_INFO("Connecting subrequests..."); + LOG_BLOCK(); + for (const auto& kvp : m_npuw_model->m_submodels_input_to_prev_output) { + const auto& subm_idx_to = kvp.first.first; + const auto& port_idx_to = kvp.first.second; + const auto& subm_idx_from = kvp.second.first; + const auto& port_idx_from = kvp.second.second; + + LOG_DEBUG("Subgraph[" << subm_idx_from << "]/" << port_idx_from << " --> " + << "Subgraph[" << subm_idx_to << "]/" << port_idx_to); + NPUW_ASSERT(m_subrequests[subm_idx_from]); // prod request is created + NPUW_ASSERT(m_subrequests[subm_idx_to]); // cons request is created + NPUW_ASSERT(m_subrequests[subm_idx_from]._ptr != m_subrequests[subm_idx_to]._ptr); + + const auto& iport = m_subrequests[subm_idx_to]->get_compiled_model()->inputs()[port_idx_to]; + const auto& oport = m_subrequests[subm_idx_from]->get_compiled_model()->outputs()[port_idx_from]; + const auto& tensor = m_subrequests[subm_idx_from]->get_tensor(oport); + LOG_DEBUG("Set Subgraph[" << subm_idx_to << "]/" << iport << " to Subgraph[" << subm_idx_from << "]/" << oport); + m_subrequests[subm_idx_to]->set_tensor(iport, tensor); + } // for(map) + LOG_INFO("Done"); + + init_gio(); + + for (size_t i = 0; i < m_num_submodels; i++) { + LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]..."); + LOG_BLOCK(); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i]; + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + continue; // Optimized out + } + unpack_closure(i, m_subrequests[i]); + LOG_VERB("Done"); + } +} + +bool ov::npuw::UnfoldInferRequest::valid_subrequest(std::size_t idx) const { + return m_subrequests.at(idx) != nullptr; +} + +void ov::npuw::UnfoldInferRequest::infer() { + const bool do_async = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>(); + + auto prepare = [&](std::size_t idx) { + if (idx >= m_subrequests.size()) { + return; + } + bind_global_params(idx, m_subrequests[idx]); + bind_global_results(idx, m_subrequests[idx]); + }; + auto wait_and_clear = [](RqPtrs& rqs) { + for (auto&& r : rqs) { + r->wait(); + } + rqs.clear(); + }; + + if (do_async) { + std::size_t past_repl_id = 0u; + RqPtrs previous_requests; + + prepare(0); + for (std::size_t idx = 0; idx < m_num_submodels; idx++) { + auto& subr = m_subrequests[idx]; + if (!subr) { + prepare(idx + 1); + continue; + } + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + const auto this_repl_id = comp_model_desc.replaced_by.value_or(idx); + if (this_repl_id != past_repl_id) { + // For non-repeating blocks, the above value_or returns idx + // For repeating blocks, it returns the function group id + // If either is not equal to the past_repl_id, make a barrier here + wait_and_clear(previous_requests); + past_repl_id = this_repl_id; + } + subr->start_async(); + previous_requests.push_back(subr); + prepare(idx + 1); + } + wait_and_clear(previous_requests); + } else { + prepare(0); + for (std::size_t idx = 0; idx < m_num_submodels; idx++) { + auto& subr = m_subrequests[idx]; + if (!subr) { + prepare(idx + 1); + continue; + } + subr->start_async(); + 
            prepare(idx + 1);
+            subr->wait();
+        }
+    } // (async)
+}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp
new file mode 100644
index 00000000000000..76b67571ec4c40
--- /dev/null
+++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp
@@ -0,0 +1,42 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "base_sync_infer_request.hpp"
+
+namespace ov {
+namespace npuw {
+
+class UnfoldInferRequest final : public IBaseInferRequest {
+public:
+    explicit UnfoldInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
+
+    ////////////////////////////////////
+    // implement IBaseInferRequest - neither of these is required here
+    // this hierarchy needs review
+    void prepare_for_infer() override {}
+    bool valid_subrequest(std::size_t idx) const override;
+    void start_subrequest(std::size_t) override {}
+    void run_subrequest_for_success(std::size_t, bool&) override {}
+    void subscribe_subrequest(std::size_t, Completed cb) override {}
+    void complete_subrequest(std::size_t) override {}
+    void cancel_subrequest(std::size_t) override {}
+    bool supports_async_pipeline() const override {
+        return false;
+    }
+    void update_subrequest_links(std::size_t) override {}
+
+private:
+    void infer() override;
+};
+
+} // namespace npuw
+} // namespace ov
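
Usage note: the short sketch below illustrates how the new NPUW_UNFOLD_IREQS property introduced by this patch might be enabled from application code; it is not part of the patch. The property key and its default (false) come from the patch itself; the ov::Core calls are the standard OpenVINO 2.0 C++ API, while the "NPU_USE_NPUW" companion switch and the model path are assumptions made here for illustration only.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    const auto model = core.read_model("model.xml");  // placeholder model path

    // Request one infer request per NPUW partition, even for repeating blocks,
    // instead of the default folded execution.
    const ov::AnyMap config = {
        {"NPU_USE_NPUW", "YES"},       // assumed switch that routes compilation through NPUW
        {"NPUW_UNFOLD_IREQS", "YES"},  // property added by this patch; defaults to false
    };

    auto compiled = core.compile_model(model, "NPU", config);
    auto request = compiled.create_infer_request();
    request.infer();
    return 0;
}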