From 3d8b5eb4e20793e2af707cdbe5f776bf41b2506e Mon Sep 17 00:00:00 2001 From: Dmitry Matveev Date: Thu, 21 Nov 2024 15:18:20 +0000 Subject: [PATCH 1/3] NPUW: Unfold infer requests (#27319) ### Details: - *item1* - *...* ### Tickets: - E-140517 --- .../src/al/include/intel_npu/config/npuw.hpp | 1 + .../intel_npu/npuw_private_properties.hpp | 8 + .../intel_npu/src/al/src/config/npuw.cpp | 1 + .../plugin/npuw/base_sync_infer_request.cpp | 291 ++++++++++++++++- .../plugin/npuw/base_sync_infer_request.hpp | 37 ++- .../src/plugin/npuw/compiled_model.cpp | 97 ++++-- .../src/plugin/npuw/compiled_model.hpp | 14 +- .../plugin/npuw/just_sync_infer_request.cpp | 308 ++---------------- .../plugin/npuw/just_sync_infer_request.hpp | 34 +- .../plugin/npuw/unfold_sync_infer_request.cpp | 140 ++++++++ .../plugin/npuw/unfold_sync_infer_request.hpp | 42 +++ 11 files changed, 621 insertions(+), 352 deletions(-) create mode 100644 src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp create mode 100644 src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 7b0dab3d16da3c..edd5b1367c217f 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -56,6 +56,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime); DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime); DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime); DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime); +DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime); DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime); DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime); DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index 67dce9621bfb4e..d7761979339eb5 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -279,6 +279,14 @@ static constexpr ov::Property parallel_compilation{"NPUW_PARALLEL_COMPILE" */ static constexpr ov::Property funcall_async{"NPUW_FUNCALL_ASYNC"}; +/** + * @brief + * Type: boolean + * Create individual infer requests for partitiongs, even repeating. + * Default value: false. 
+ */ +static constexpr ov::Property unfold_ireqs{"NPUW_UNFOLD_IREQS"}; + namespace accuracy { /** * @brief diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 6a519a0f754a32..a4478ba3c9dcd2 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -38,6 +38,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp index 216b1a35b4315c..77d000cb415de7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp @@ -7,6 +7,7 @@ #include "compiled_model.hpp" #include "intel_npu/config/npuw.hpp" #include "logging.hpp" +#include "openvino/core/parallel.hpp" #include "util.hpp" ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr& compiled_model) @@ -58,12 +59,8 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re LOG_INFO("- Trying next device..."); comp_model_desc.device_it++; can_try_again = m_npuw_model->compile_for_success(id); - if (can_try_again) { - if (recompiled) - *recompiled = true; - // Probably shouldn't be called all the time, but only if - // I/O submodel is affected - m_npuw_model->reset_io(); + if (can_try_again && recompiled) { + *recompiled = true; } } } // while(!new_ireq && can_try_again) @@ -178,6 +175,33 @@ void ov::npuw::IBaseInferRequest::check_tensors() const { return; } +std::vector> ov::npuw::IBaseInferRequest::query_state() const { + std::vector> variable_states = {}; + for (const auto& request : m_subrequests) { + if (!request) // optimized out + continue; + for (auto&& state : request->query_state()) { + if (!state._so) + state._so = request._so; + variable_states.emplace_back(state); + } + } + return variable_states; +} + +std::vector ov::npuw::IBaseInferRequest::get_profiling_info() const { + std::vector info; + for (size_t i = 0; i < m_subrequests.size(); ++i) { + if (!m_subrequests[i]) // optimized out + continue; + auto&& subreq_info = m_subrequests[i]->get_profiling_info(); + for (auto&& rec : subreq_info) + rec.node_name = std::string("subgraph") + std::to_string(i) + ": " + rec.node_name; + info.insert(info.end(), subreq_info.begin(), subreq_info.end()); + } + return info; +} + void ov::npuw::IBaseInferRequest::infer() { m_now_idx.reset(); prepare_for_infer(); @@ -209,6 +233,261 @@ void ov::npuw::IBaseInferRequest::infer() { m_now_idx.reset(); } +std::size_t ov::npuw::IBaseInferRequest::total_subrequests() const { + return m_subrequests.size(); +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocMem(const ov::element::Type type, + const ov::Shape& shape, + const std::string& device) { + if (device == "CPU" || ov::shape_size(shape) == 0) { + return ov::get_tensor_impl(ov::Tensor(type, shape)); + } + + auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; + auto remote_tensor = remote_ctx->create_host_tensor(type, shape); + return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocOut(const ov::Output& node, + const std::string& device) { + return allocMem(node.get_element_type(), node.get_shape(), device); +} + +void ov::npuw::IBaseInferRequest::alloc_io() { 
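Since NPUW_UNFOLD_IREQS is wired in as a regular NPUW option (the DEFINE_OPT and registerNPUWOptions entries above, plus the BIND entry further down in compiled_model.cpp), it can be passed through the ordinary compile-time configuration. A minimal usage sketch, not taken from this patch; the "NPU_USE_NPUW" enablement key and the model path are assumptions:

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");              // assumed model path
        ov::AnyMap cfg = {{"NPU_USE_NPUW", "YES"},               // assumed NPUW enablement key
                          {"NPUW_UNFOLD_IREQS", "YES"}};         // option added by this patch
        auto compiled = core.compile_model(model, "NPU", cfg);
        auto request = compiled.create_infer_request();          // per-subgraph requests are created up front
        request.infer();
        return 0;
    }

With unfolding requested, the actual request type is still decided in create_sync_infer_request() (see compiled_model.cpp below): if any subgraph is spatial or needs weight unpacking, the plugin logs a warning and falls back to the regular JustInferRequest.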
+ // Preallocate input tensors + LOG_INFO("Preallocating input tensors..."); + for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { + const auto& port = m_npuw_model->inputs()[i]; + ov::SoPtr allocated = allocOut(port, m_npuw_model->global_mem_device()); + m_input_tensors.push_back(allocated); + m_input_allocated.insert(allocated->data()); + m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true}; + } // for(inputs) + + // Preallocate output tensors + LOG_INFO("Preallocating output tensors..."); + for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { + LOG_BLOCK(); + const auto& port = m_npuw_model->outputs()[i]; + LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port); + + // FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom + const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i); + LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second); + + auto tensor = alloc_global_out(i); + m_output_tensors.push_back(tensor); + m_port_to_tensor[port] = TensorStorage{tensor, true}; + } +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) { + const auto& port = m_npuw_model->outputs().at(out_idx); + return allocOut(port, m_npuw_model->global_mem_device()); +} + +void ov::npuw::IBaseInferRequest::init_gio() { + // Build the parameter/result mapping + m_subrequests_gio.resize(m_subrequests.size()); + + // Parameters: stage 1... + for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { + const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i); + if (to_submodel != CompiledModel::NO_LINK) { + std::size_t sub_idx{}, in_idx{}; + std::tie(sub_idx, in_idx) = to_submodel; + m_subrequests_gio.at(sub_idx).global_params[i] = in_idx; + } + } // for(inputs) + + // Parameters: stage 2... + for (auto&& it : m_npuw_model->m_param_subscribers) { + const auto param_idx = it.first; + for (auto&& to_submodel : it.second) { + std::size_t sub_idx{}, in_idx{}; + std::tie(sub_idx, in_idx) = to_submodel; + m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx; + } + } + + // Results + for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { + std::size_t sub_idx{}, out_idx{}; + std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i); + m_subrequests_gio.at(sub_idx).global_results[i] = out_idx; + } +} + +void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request) { + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + + NPUW_ASSERT(comp_model_desc.replaced_by); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx]; + + // Bind extra parameters from the function's closure + // First, do easy things & delay heavy stuff + std::vector closure_unpack_required; + std::vector closure_copy_required; + + for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { + auto& closure = comp_model_desc.closure[cidx]; + const auto closure_param_id = comp_model_desc.param_base + cidx; + + if (m_npuw_model->is_gather_closure(idx, cidx)) { + // No need to set/copy the host_gather's closure tensor int + // the subrequest - it is just a dummy. host_gather writes + // to the right buffer directly. 
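The bookkeeping built by init_gio() above is easiest to see on a toy layout. A hypothetical sketch — the subgraph split and every index below are invented for illustration, and GlobalIO simply mirrors the struct added to base_sync_infer_request.hpp in this patch:

    #include <cstddef>
    #include <map>
    #include <vector>

    struct GlobalIO {                                        // mirrors IBaseInferRequest::GlobalIO
        std::map<std::size_t, std::size_t> global_params;    // global param idx  -> subrequest input idx
        std::map<std::size_t, std::size_t> global_results;   // global result idx -> subrequest output idx
    };

    int main() {
        // Toy model: three global Parameters and two global Results, split into two subgraphs.
        std::vector<GlobalIO> gio(2);
        gio[0].global_params  = {{0, 0}, {1, 1}};   // params 0,1 feed Subgraph[0] inputs 0,1
        gio[1].global_params  = {{2, 0}};           // param 2 feeds Subgraph[1] input 0
        gio[1].global_results = {{0, 0}, {1, 1}};   // results 0,1 come from Subgraph[1] outputs 0,1
        return 0;
    }

bind_global_params() and bind_global_results() then walk exactly these maps to set (or copy) the global tensors on the corresponding subrequest ports.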
+ continue; + } + + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + if (m_npuw_model->unpack_required(idx, cidx)) { + // Remember where the unpack is required + closure_unpack_required.push_back(cidx); + } else { + if (needs_copy(idx, cidx)) { + // Remember where copy is requried + closure_copy_required.push_back(cidx); + } else { + // Easy case, just set one to another + request->set_tensor(iport, ov::get_tensor_impl(closure)); + } + } + } // for(closure) + + // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){ + ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) { + auto cidx = closure_copy_required[j]; + auto& closure = comp_model_desc.closure[cidx]; + const auto closure_param_id = comp_model_desc.param_base + cidx; + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + auto clparam = request->get_tensor(iport); + ov::get_tensor_impl(closure)->copy_to(clparam._ptr); + }); + // }); // ms_to_run + + for (std::size_t j = 0; j != closure_unpack_required.size(); j++) { + // NB: No need to protect anything here as containers are all + // preallocated and we only access elements under particular (thread + // -local) indices. + auto cidx = closure_unpack_required[j]; + + // FIXME: zerops are stored with absolute indexing, this needs to be aligned + auto& closure = comp_model_desc.closure[cidx]; + + const auto closure_param_id = comp_model_desc.param_base + cidx; + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + auto clparam = request->get_tensor(iport); + + if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) { + // Unpacking this weight requires scaling with zero points... + ov::npuw::util::unpack(ov::get_tensor_impl(closure), + ov::get_tensor_impl(comp_model_desc.zerops[cidx]), + ov::get_tensor_impl(comp_model_desc.scales[cidx]), + clparam); + } else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) { + // Unpacking this weight requires scaling + ov::npuw::util::unpack(ov::get_tensor_impl(closure), + ov::get_tensor_impl(comp_model_desc.scales[cidx]), + clparam); + } else { + // Unpacking this weight doesn't require scaling + ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam); + } + } +} + +void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr request) { + LOG_DEBUG("Binding parameters for Subgraph[" << idx << "]"); + LOG_BLOCK(); + + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + const auto real_idx = comp_model_desc.replaced_by.value_or(idx); + + const bool do_copy = needs_copy(idx); + const auto& iodesc = m_subrequests_gio.at(idx); + + const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const bool is_spatial = proto_comp_model_desc.spatial.has_value(); + + // a list of ports to copy tensors, if needed: FROM -> TO + std::vector, ov::Output>> copy_list; + + // Check if the given subgraph's input is spatial + auto is_spatial_param = [&](std::size_t sub_in_idx) -> bool { + if (!is_spatial) { + return false; // Early return + } + auto& spatial = proto_comp_model_desc.spatial.value(); + return std::any_of(spatial.params.begin(), spatial.params.end(), [&](const auto& p) -> bool { + return p.idx == sub_in_idx; + }); + }; + + for (auto&& it : iodesc.global_params) { + std::size_t param_idx{}, sub_in_idx{}; + std::tie(param_idx, sub_in_idx) = it; + LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl); + + const auto& g_port = 
m_npuw_model->inputs()[param_idx]; + const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor; + const auto& s_port = request->get_inputs()[sub_in_idx]; + LOG_DEBUG("Processing " << g_port << " -> " << s_port << "..."); + LOG_BLOCK(); + if (!is_spatial_param(sub_in_idx)) { + // Input parameter is non-spatial, do normal handling + if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) { + LOG_DEBUG("Will be copied"); + copy_list.emplace_back(g_tnsr, s_port); + } else { + LOG_DEBUG("Will be set"); + request->set_tensor(s_port, g_tnsr); + } + } else { + // Register for future use + m_spatial_io[real_idx].inputs.at(sub_in_idx) = g_tnsr; + } + } + + LOG_DEBUG("Running copy..."); + ov::parallel_for(copy_list.size(), [&](std::size_t idx) { + auto& it = copy_list[idx]; + ov::SoPtr dst = request->get_tensor(it.second); + it.first->copy_to(dst._ptr); + }); + + // Run host-side gather, if required + if (comp_model_desc.host_gather.dst_idx != -1) { + const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx]; + const auto gather = request->get_tensor(gport); + + const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base]; + const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx]; + const auto lookup = request->get_tensor(lport); + ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather); + } + + LOG_DEBUG("Done"); +} + +void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr request) { + LOG_DEBUG("Binding results for Subgraph[" << idx << "]"); + LOG_BLOCK(); + + const auto& iodesc = m_subrequests_gio.at(idx); + for (auto&& it : iodesc.global_results) { + std::size_t result_idx{}, sub_out_idx{}; + std::tie(result_idx, sub_out_idx) = it; + const auto& g_port = m_npuw_model->outputs()[result_idx]; + const auto& s_port = request->get_outputs()[sub_out_idx]; + request->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor); + } + + LOG_DEBUG("Done"); +} + void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp index 6be64d676d6149..ae24dcfee11f9d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp @@ -19,8 +19,15 @@ namespace ov { namespace npuw { +using TensorPtr = ov::SoPtr; + class CompiledModel; +using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure + // This interface is provided to npuw::AsyncInferRequest to manage the // individual subrequests' execution class IBaseInferRequest : public ov::ISyncInferRequest { @@ -40,6 +47,10 @@ class IBaseInferRequest : public ov::ISyncInferRequest { void check_tensors() const override; + // Query APIs - some default implementations here + std::vector> query_state() const override; + std::vector get_profiling_info() const override; + using sptr = std::shared_ptr; using Completed = std::function; @@ -50,7 +61,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest { virtual void run_subrequest_for_success(std::size_t idx, bool& failover) = 0; virtual void complete_subrequest(std::size_t idx) = 0; virtual void cancel_subrequest(std::size_t 
idx) = 0; - virtual std::size_t total_subrequests() const = 0; + virtual std::size_t total_subrequests() const; virtual bool supports_async_pipeline() const = 0; protected: @@ -107,8 +118,32 @@ class IBaseInferRequest : public ov::ISyncInferRequest { }; std::vector m_spatial_io; + // This structure tracks how every individual subrequest + // access the model's top-level (global, public, etc) parameters + // and results. Again, is managed by subclasses + struct GlobalIO { + using map_t = std::map; + map_t global_params; // param idx -> input idx + map_t global_results; // result idx -> output idx + }; + std::vector m_subrequests_gio; + + // Tracks tensors we allocated on our own - to recognize and avoid copies + std::unordered_set m_input_allocated; + + // Common functionality - shared for subclasses const std::size_t m_num_submodels; + TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device); + TensorPtr allocOut(const ov::Output& node, const std::string& device); + virtual void alloc_io(); + virtual TensorPtr alloc_global_out(std::size_t out_idx); + + virtual void init_gio(); + void unpack_closure(std::size_t idx, RqPtr request); + virtual void bind_global_params(std::size_t idx, RqPtr request); + virtual void bind_global_results(std::size_t idx, RqPtr request); + void dump_input_tensors(std::size_t idx); void dump_output_tensors(std::size_t idx); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index b52dd40ea59364..8770dee0d68eea 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -21,6 +21,7 @@ #include "openvino/util/common_util.hpp" #include "partitioning/patterns/opt.hpp" #include "plugin.hpp" +#include "unfold_sync_infer_request.hpp" #include "util.hpp" // required for get_properties_per_device() @@ -442,9 +443,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } implement_properties(); - - m_finalized = true; - reset_io(); + report_io(); } void ov::npuw::CompiledModel::finalize_weights_bank() { @@ -570,19 +569,7 @@ void ov::npuw::CompiledModel::fill_empty_tensor_names(const std::shared_ptr ov::npuw::CompiledModel::create_just_sync_infer_request() { - auto this_sptr = std::static_pointer_cast(shared_from_this()); - return std::make_shared(this_sptr); -} - std::shared_ptr ov::npuw::CompiledModel::create_sync_infer_request() const { // Synchronous infer request implementation may vary based on the // selected strategy auto* non_const_this = const_cast(this); // because of const in API - return non_const_this->create_just_sync_infer_request(); + auto non_const_this_sptr = std::static_pointer_cast(non_const_this->shared_from_this()); + + auto no_spatial_unpack = [&]() { + const auto num_submodels = m_compiled_submodels.size(); + for (std::size_t idx = 0u; idx < num_submodels; idx++) { + const auto& comp_model_desc = m_compiled_submodels[idx]; + if (!comp_model_desc.replaced_by.has_value()) { + // not a funcall, do nothing + continue; + } + const auto real_idx = comp_model_desc.replaced_by.value(); + if (m_compiled_submodels[real_idx].spatial) { + LOG_WARN("Subgraph[" << idx << "] is a call to spatial function, unfold can't be done"); + return false; // Spatial graph + } + if (unpack_required(idx)) { + LOG_WARN("Subgraph[" << idx << "] requires unpack, unfold can't be done"); + return false; // Unpack required + } + } + return true; // no spatial & subgraphs 
requiring unpack found + }; + + std::shared_ptr result; + if (m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>() && no_spatial_unpack()) { + result.reset(new ov::npuw::UnfoldInferRequest(non_const_this_sptr)); + } else { + result.reset(new ov::npuw::JustInferRequest(non_const_this_sptr)); + } + NPUW_ASSERT(result); + return result; } std::shared_ptr ov::npuw::CompiledModel::create_infer_request() const { @@ -776,6 +788,46 @@ std::string ov::npuw::CompiledModel::submodel_device(const std::size_t idx) cons return *comp_subm_desc.device_it; } +bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx) const { + auto& comp_model_desc = m_compiled_submodels.at(idx); + for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { + if (unpack_required(idx, cidx)) { + return true; + } + } + return false; +} + +bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx, const std::size_t cidx) const { + if (is_gather_closure(idx, cidx)) { + return false; + } + + auto& comp_model_desc = m_compiled_submodels.at(idx); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_compiled_submodels.at(real_idx); + + auto& closure = comp_model_desc.closure.at(cidx); + const auto closure_param_id = comp_model_desc.param_base + cidx; + + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + return (closure.get_element_type() != iport.get_element_type()); +} + +bool ov::npuw::CompiledModel::is_gather_closure(const std::size_t idx, const std::size_t cidx) const { + auto& comp_model_desc = m_compiled_submodels.at(idx); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_compiled_submodels.at(real_idx); + + const auto closure_param_id = comp_model_desc.param_base + cidx; + + if (func_desc.host_gather.dst_idx != -1 && + static_cast(func_desc.host_gather.dst_idx) == closure_param_id) { + return true; + } + return false; +} + void ov::npuw::CompiledModel::log_device_dist() const { std::unordered_map stats_for_devices; execution_stats stats_for_optimized_out{0.f, 0ul}; @@ -934,6 +986,7 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE), BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE), BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC), + BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS), BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK), BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC), BIND(npuw::cache_dir, NPUW_CACHE_DIR), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 6199ac66c0c64e..ece1bc78fb5bf5 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -47,6 +47,7 @@ class CompiledModel : public ov::ICompiledModel { // FIXME: This class has many friends.. 
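The unpack_required() rule introduced above reduces to an element-type comparison between the stored closure tensor and the compiled input port it feeds. A hypothetical pairing, just to make the rule concrete — the i4/f16 combination is an assumption, not taken from the patch:

    #include <openvino/core/type/element_type.hpp>

    // True when the closure weight would need unpacking before it can be fed to
    // the port - which, per no_spatial_unpack() above, also rules out the
    // UnfoldInferRequest path for the whole model.
    bool needs_unpack(const ov::element::Type& closure_type, const ov::element::Type& port_type) {
        return closure_type != port_type;
    }

    int main() {
        const bool a = needs_unpack(ov::element::i4,  ov::element::f16);  // true  - packed weight
        const bool b = needs_unpack(ov::element::f16, ov::element::f16);  // false - can be set/copied as-is
        return (a && !b) ? 0 : 1;
    }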
friend class IBaseInferRequest; friend class JustInferRequest; + friend class UnfoldInferRequest; friend class MemAccessSim; friend class FuncMemMgr; @@ -57,28 +58,27 @@ class CompiledModel : public ov::ICompiledModel { void dump_on_fail(std::size_t id, const std::string& device_to_stry, const char* extra); - bool m_finalized = false; - void reset_io(); + void report_io() const; // This is used for removing too long output tensor names to fix some compilation issues + // NB: These two methods has nothing to do with this particular class and should be + // moved elsewhere void remove_long_output_names(const std::shared_ptr& model); void fill_empty_tensor_names(const std::shared_ptr& model); std::shared_ptr get_npuw_plugin() const; - - std::shared_ptr create_just_sync_infer_request(); std::shared_ptr create_sync_infer_request() const override; std::string submodel_device(const std::size_t idx) const; + bool is_gather_closure(const std::size_t idx, const std::size_t cidx) const; + bool unpack_required(const std::size_t idx) const; + bool unpack_required(const std::size_t idx, const std::size_t cidx) const; void log_device_dist() const; - void implement_properties(); void finalize_weights_bank(); - std::string global_mem_device() const; - std::string funcall_mem_device(const std::size_t idx) const; std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 0e0b96582a663c..8d1c7c4a30acde 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -310,69 +310,9 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrinputs().size(); i++) { - const auto& port = m_npuw_model->inputs()[i]; - ov::SoPtr allocated = allocOut(port, m_npuw_model->global_mem_device()); - m_input_tensors.push_back(allocated); - m_input_allocated.insert(allocated->data()); - m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true}; - } // for(inputs) - - // Preallocate output tensors - LOG_INFO("Preallocating output tensors..."); - for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { - LOG_BLOCK(); - const auto& port = m_npuw_model->outputs()[i]; - LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port); - - // FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom - const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i); - - LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second); - auto funcall_result_iter = m_funcall_result.find(from_submodel); - - const auto& tensor = - funcall_result_iter != m_funcall_result.end() - ? funcall_result_iter->second // Function calls have their tensors allocated, so just use one - : allocOut(port, m_npuw_model->global_mem_device()); - - m_output_tensors.push_back(tensor); - m_port_to_tensor[port] = TensorStorage{tensor, true}; - } + alloc_io(); connect_subrequests(); - - // Build the parameter/result mapping {{{ - m_subrequests_gio.resize(m_subrequests.size()); - - // Parameters: stage 1... 
- for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { - const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i); - if (to_submodel != CompiledModel::NO_LINK) { - std::size_t sub_idx{}, in_idx{}; - std::tie(sub_idx, in_idx) = to_submodel; - m_subrequests_gio.at(sub_idx).global_params[i] = in_idx; - } - } // for(inputs) - - // Parameters: stage 2... - for (auto&& it : m_npuw_model->m_param_subscribers) { - const auto param_idx = it.first; - for (auto&& to_submodel : it.second) { - std::size_t sub_idx{}, in_idx{}; - std::tie(sub_idx, in_idx) = to_submodel; - m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx; - } - } - - // Results - for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { - std::size_t sub_idx{}, out_idx{}; - std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i); - m_subrequests_gio.at(sub_idx).global_results[i] = out_idx; - } - // }}} + init_gio(); for (size_t i = 0; i < m_num_submodels; i++) { LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]..."); @@ -413,6 +353,15 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_outputs_to_submodels_outputs.at(out_idx); + auto funcall_result_iter = m_funcall_result.find(from_submodel); + if (funcall_result_iter != m_funcall_result.end()) { + return funcall_result_iter->second; + } + return IBaseInferRequest::alloc_global_out(out_idx); +} + void ov::npuw::JustInferRequest::connect_subrequests() { LOG_INFO("Connecting subrequests..."); LOG_BLOCK(); @@ -478,33 +427,6 @@ void ov::npuw::JustInferRequest::connect_subrequests() { LOG_INFO("Done"); } -std::vector> ov::npuw::JustInferRequest::query_state() const { - std::vector> variable_states = {}; - for (const auto& request : m_subrequests) { - if (!request) // optimized out - continue; - for (auto&& state : request->query_state()) { - if (!state._so) - state._so = request._so; - variable_states.emplace_back(state); - } - } - return variable_states; -} - -std::vector ov::npuw::JustInferRequest::get_profiling_info() const { - std::vector info; - for (size_t i = 0; i < m_subrequests.size(); ++i) { - if (!m_subrequests[i]) // optimized out - continue; - auto&& subreq_info = m_subrequests[i]->get_profiling_info(); - for (auto&& rec : subreq_info) - rec.node_name = std::string("subgraph") + std::to_string(i) + ": " + rec.node_name; - info.insert(info.end(), subreq_info.begin(), subreq_info.end()); - } - return info; -} - void ov::npuw::JustInferRequest::prepare_for_infer() { LOG_DEBUG("Preparing to infer..."); LOG_BLOCK(); @@ -542,118 +464,36 @@ void ov::npuw::JustInferRequest::start_subrequest(std::size_t idx) { } void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) { - LOG_DEBUG("Binding parameters for Subgraph[" << idx << "]"); - LOG_BLOCK(); - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; const auto real_idx = comp_model_desc.replaced_by.value_or(idx); - const bool do_copy = needs_copy(idx); - const auto& iodesc = m_subrequests_gio.at(idx); - - const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - const bool is_spatial = proto_comp_model_desc.spatial.has_value(); - - // a list of ports to copy tensors, if needed: FROM -> TO - std::vector, ov::Output>> copy_list; - // pick which subrequest we actually work on here - auto subr = [&]() { - if (now_idx() && real_idx == real(now_idx().value()) && is_pipelined(now_idx().value())) { - LOG_DEBUG("Accessing the pipeline subrequest"); - // The real index of request we need 
to prepare IS - // the same request which executes now AND - // function_pipelining enabled - select the reserve request. - NPUW_ASSERT(m_funcall_pipeline[real_idx].subrequest); - return m_funcall_pipeline[real_idx].subrequest; - } + if (now_idx() && real_idx == real(now_idx().value()) && is_pipelined(now_idx().value())) { + LOG_DEBUG("Accessing the pipeline subrequest"); + // The real index of request we need to prepare IS + // the same request which executes now AND + // function_pipelining enabled - select the reserve request. + NPUW_ASSERT(m_funcall_pipeline[real_idx].subrequest); + bind_global_params(idx, m_funcall_pipeline[real_idx].subrequest); + } else { // Otherwise: Just a return a subrequest which is in place. // If it is a function call and we have function pipelining ON, // it is still the right subrequest we can use. LOG_DEBUG("Accessing the primary subrequest"); - return m_subrequests[real_idx]; - }(); - - // Check if the given subgraph's input is spatial - auto is_spatial_param = [&](std::size_t sub_in_idx) -> bool { - if (!is_spatial) { - return false; // Early return - } - auto& spatial = proto_comp_model_desc.spatial.value(); - return std::any_of(spatial.params.begin(), spatial.params.end(), [&](const auto& p) -> bool { - return p.idx == sub_in_idx; - }); - }; - - for (auto&& it : iodesc.global_params) { - std::size_t param_idx{}, sub_in_idx{}; - std::tie(param_idx, sub_in_idx) = it; - LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl); - - const auto& g_port = m_npuw_model->inputs()[param_idx]; - const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor; - const auto& s_port = subr->get_inputs()[sub_in_idx]; - LOG_DEBUG("Processing " << g_port << " -> " << s_port << "..."); - LOG_BLOCK(); - if (!is_spatial_param(sub_in_idx)) { - // Input parameter is non-spatial, do normal handling - if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) { - LOG_DEBUG("Will be copied"); - copy_list.emplace_back(g_tnsr, s_port); - } else { - LOG_DEBUG("Will be set"); - subr->set_tensor(s_port, g_tnsr); - } - } else { - // Register for future use - m_spatial_io[real_idx].inputs.at(sub_in_idx) = g_tnsr; - } + bind_global_params(idx, m_subrequests[real_idx]); } - - LOG_DEBUG("Running copy..."); - ov::parallel_for(copy_list.size(), [&](std::size_t idx) { - auto& it = copy_list[idx]; - ov::SoPtr dst = subr->get_tensor(it.second); - it.first->copy_to(dst._ptr); - }); - - // Run host-side gather, if required - if (comp_model_desc.host_gather.dst_idx != -1) { - const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx]; - const auto gather = subr->get_tensor(gport); - - const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base]; - const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx]; - const auto lookup = subr->get_tensor(lport); - ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather); - } - - LOG_DEBUG("Done"); } void ov::npuw::JustInferRequest::bind_global_results(std::size_t idx) { - LOG_DEBUG("Binding results for Subgraph[" << idx << "]"); - LOG_BLOCK(); - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; if (comp_model_desc.replaced_by) { // Don't do here - function call will take the right tensor // itself. 
Note it may be implemented more efficently than now // (and in some cases, the tensor can be pre-set) - LOG_DEBUG("Skipping this too now - function will do it for itself"); + LOG_DEBUG("Skipping bind_glo - function will do it for itself"); return; } - - const auto& iodesc = m_subrequests_gio.at(idx); - for (auto&& it : iodesc.global_results) { - std::size_t result_idx{}, sub_out_idx{}; - std::tie(result_idx, sub_out_idx) = it; - const auto& g_port = m_npuw_model->outputs()[result_idx]; - const auto& s_port = m_subrequests[idx]->get_outputs()[sub_out_idx]; - m_subrequests[idx]->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor); - } - - LOG_DEBUG("Done"); + IBaseInferRequest::bind_global_results(idx, m_subrequests[idx]); } void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { @@ -737,88 +577,6 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { LOG_DEBUG("Done"); } -void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - - NPUW_ASSERT(comp_model_desc.replaced_by); - const auto real_idx = comp_model_desc.replaced_by.value(); - auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx]; - - // Bind extra parameters from the function's closure - // First, do easy things & delay heavy stuff - std::vector closure_unpack_required; - std::vector closure_copy_required; - - for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { - auto& closure = comp_model_desc.closure[cidx]; - - const auto closure_param_id = comp_model_desc.param_base + cidx; - - if (func_desc.host_gather.dst_idx != -1 && - static_cast(func_desc.host_gather.dst_idx) == closure_param_id) { - // No need to set/copy the host_gather's closure tensor int - // the subrequest - it is just a dummy. host_gather writes - // to the right buffer directly. - continue; - } - - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - if (closure.get_element_type() != iport.get_element_type()) { - // Remember where the unpack is required - closure_unpack_required.push_back(cidx); - } else { - if (needs_copy(idx, cidx)) { - // Remember where copy is requried - closure_copy_required.push_back(cidx); - } else { - // Easy case, just set one to another - request->set_tensor(iport, ov::get_tensor_impl(closure)); - } - } - } // for(closure) - - // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){ - ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) { - auto cidx = closure_copy_required[j]; - auto& closure = comp_model_desc.closure[cidx]; - const auto closure_param_id = comp_model_desc.param_base + cidx; - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - auto clparam = request->get_tensor(iport); - ov::get_tensor_impl(closure)->copy_to(clparam._ptr); - }); - // }); // ms_to_run - - for (std::size_t j = 0; j != closure_unpack_required.size(); j++) { - // NB: No need to protect anything here as containers are all - // preallocated and we only access elements under particular (thread - // -local) indices. 
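The reserve-request selection in bind_global_parameters() above is the binding half of the function-pipelining scheme: while one request of a repeated function executes, its twin is being prepared for the next call. A generic double-buffering sketch of that idea only — the names prepare/run stand in for the bind/infer steps and this is not the plugin's code:

    #include <cstddef>
    #include <iostream>

    struct Req {                        // stand-in for an infer request
        void prepare(std::size_t call) { std::cout << "bind inputs for call " << call << "\n"; }
        void run()                     { std::cout << "infer\n"; }
    };

    int main() {
        const std::size_t n_calls = 4;
        Req reqs[2];                    // primary + reserve request for one repeated function
        std::size_t cur = 0;

        reqs[cur].prepare(0);
        for (std::size_t call = 0; call < n_calls; ++call) {
            if (call + 1 < n_calls) {
                reqs[cur ^ 1].prepare(call + 1);   // bind the next call while this one (conceptually) runs
            }
            reqs[cur].run();
            cur ^= 1;                              // swap primary and reserve
        }
        return 0;
    }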
- auto cidx = closure_unpack_required[j]; - - // FIXME: zerops are stored with absolute indexing, this needs to be aligned - auto& closure = comp_model_desc.closure[cidx]; - - const auto closure_param_id = comp_model_desc.param_base + cidx; - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - auto clparam = request->get_tensor(iport); - - if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) { - // Unpacking this weight requires scaling with zero points... - ov::npuw::util::unpack(ov::get_tensor_impl(closure), - ov::get_tensor_impl(comp_model_desc.zerops[cidx]), - ov::get_tensor_impl(comp_model_desc.scales[cidx]), - clparam); - } else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) { - // Unpacking this weight requires scaling - ov::npuw::util::unpack(ov::get_tensor_impl(closure), - ov::get_tensor_impl(comp_model_desc.scales[cidx]), - clparam); - } else { - // Unpacking this weight doesn't require scaling - ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam); - } - } -} - void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) { auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; auto real_idx = comp_model_desc.replaced_by.value_or(idx); @@ -1110,24 +868,6 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool } // if (replaced_by) } -ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type, - const ov::Shape& shape, - const std::string& device) { - if (device == "CPU" || ov::shape_size(shape) == 0) { - return ov::get_tensor_impl(ov::Tensor(type, shape)); - } - - // Protect access to shared context(s) - at least among infer requests - auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; - auto remote_tensor = remote_ctx->create_host_tensor(type, shape); - return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); -} - -ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output& node, - const std::string& device) { - return allocMem(node.get_element_type(), node.get_shape(), device); -} - void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) { get_real_subrequest(idx)->set_callback(std::move(cb)); } @@ -1140,10 +880,6 @@ void ov::npuw::JustInferRequest::cancel_subrequest(std::size_t idx) { m_subrequests[idx]->cancel(); } -std::size_t ov::npuw::JustInferRequest::total_subrequests() const { - return m_subrequests.size(); -} - bool ov::npuw::JustInferRequest::supports_async_pipeline() const { return false; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index d219f170a8e6bb..a935220b4b8943 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -23,13 +23,6 @@ namespace npuw { class CompiledModel; class AsyncInferRequest; -using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure - -using TensorPtr = ov::SoPtr; - class MemAccessSim { public: explicit MemAccessSim(const std::shared_ptr& compiled_model); @@ -77,11 +70,7 @@ class JustInferRequest final : public IBaseInferRequest { public: explicit JustInferRequest(const std::shared_ptr& compiled_model); - // Query APIs - std::vector> query_state() const override; - std::vector get_profiling_info() const override; - -private: +protected: 
//////////////////////////////////// // implement IBaseInferRequest void prepare_for_infer() override; @@ -91,11 +80,11 @@ class JustInferRequest final : public IBaseInferRequest { void subscribe_subrequest(std::size_t idx, Completed cb) override; void complete_subrequest(std::size_t idx) override; void cancel_subrequest(std::size_t idx) override; - std::size_t total_subrequests() const override; bool supports_async_pipeline() const override; - void update_subrequest_links(std::size_t idx) override; + TensorPtr alloc_global_out(std::size_t out_idx) override; + //////////////////////////////////// // now own API @@ -104,9 +93,9 @@ class JustInferRequest final : public IBaseInferRequest { void bind_global_parameters(std::size_t idx); void bind_global_results(std::size_t idx); + using IBaseInferRequest::bind_global_results; void function_prologue(std::size_t idx); - void unpack_closure(std::size_t idx, RqPtr request); void unsafe_during(std::size_t real_idx, const std::function& f); void unsafe_infer(std::size_t real_idx); @@ -115,9 +104,6 @@ class JustInferRequest final : public IBaseInferRequest { void connect_subrequests(); void recreate_subrequests(std::size_t idx); - TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device); - TensorPtr allocOut(const ov::Output& node, const std::string& device); - FuncMemMgr m_func_mem_mgr; // Owns memory std::map m_funcall_result; // Provides a convenient link @@ -139,18 +125,6 @@ class JustInferRequest final : public IBaseInferRequest { // initialized. std::vector m_funcall_pipeline; - // This structure tracks how every individual subrequest - // access the model's top-level (global, public, etc) parameters - // and results - struct GlobalIO { - using map_t = std::map; - map_t global_params; // param idx -> input idx - map_t global_results; // result idx -> output idx - }; - std::vector m_subrequests_gio; - - std::unordered_set m_input_allocated; - // Represents spatial run-time info runtime::spatial::Selector::Ptr m_spatial_selector; diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp new file mode 100644 index 00000000000000..90eb62dcc0a8e3 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "unfold_sync_infer_request.hpp" + +#include "compiled_model.hpp" +#include "logging.hpp" +#include "openvino/core/parallel.hpp" + +ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr& compiled_model) + : ov::npuw::IBaseInferRequest(compiled_model) { + // Create infer requests + // Preallocate funcall tensors & substitute function call requests + for (std::size_t i = 0; i < m_num_submodels; i++) { + LOG_INFO("Creating infer request for Subgraph[" << i << "]..."); + LOG_BLOCK(); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i]; + + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + // no model & no funcall - optimized out, do nothing + LOG_INFO("OPTIMIZED OUT"); + continue; + } + + if (comp_model_desc.replaced_by) { + // Pre-allocate output tensors for this function call + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (proto_comp_model_desc.spatial) { + NPUW_ASSERT(false && "Spatial is not supported in unfold"); + } + } // 
if(replaced_by) + + const auto real_idx = comp_model_desc.replaced_by.value_or(i); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + // NB: UnfoldInferRequest is _NOT_ fail-safe! Fail means fail here + m_subrequests[i] = proto_comp_model_desc.compiled_model->create_infer_request(); + m_subrequest_devices[i] = *proto_comp_model_desc.device_it; + LOG_INFO("DONE"); + } // for(submodels) + + alloc_io(); + + LOG_INFO("Connecting subrequests..."); + LOG_BLOCK(); + for (const auto& kvp : m_npuw_model->m_submodels_input_to_prev_output) { + const auto& subm_idx_to = kvp.first.first; + const auto& port_idx_to = kvp.first.second; + const auto& subm_idx_from = kvp.second.first; + const auto& port_idx_from = kvp.second.second; + + LOG_DEBUG("Subgraph[" << subm_idx_from << "]/" << port_idx_from << " --> " + << "Subgraph[" << subm_idx_to << "]/" << port_idx_to); + NPUW_ASSERT(m_subrequests[subm_idx_from]); // prod request is created + NPUW_ASSERT(m_subrequests[subm_idx_to]); // cons request is created + NPUW_ASSERT(m_subrequests[subm_idx_from]._ptr != m_subrequests[subm_idx_to]._ptr); + + const auto& iport = m_subrequests[subm_idx_to]->get_compiled_model()->inputs()[port_idx_to]; + const auto& oport = m_subrequests[subm_idx_from]->get_compiled_model()->outputs()[port_idx_from]; + const auto& tensor = m_subrequests[subm_idx_from]->get_tensor(oport); + LOG_DEBUG("Set Subgraph[" << subm_idx_to << "]/" << iport << " to Subgraph[" << subm_idx_from << "]/" << oport); + m_subrequests[subm_idx_to]->set_tensor(iport, tensor); + } // for(map) + LOG_INFO("Done"); + + init_gio(); + + for (size_t i = 0; i < m_num_submodels; i++) { + LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]..."); + LOG_BLOCK(); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i]; + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + continue; // Optimized out + } + unpack_closure(i, m_subrequests[i]); + LOG_VERB("Done"); + } +} + +bool ov::npuw::UnfoldInferRequest::valid_subrequest(std::size_t idx) const { + return m_subrequests.at(idx) != nullptr; +} + +void ov::npuw::UnfoldInferRequest::infer() { + const bool do_async = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>(); + + auto prepare = [&](std::size_t idx) { + if (idx >= m_subrequests.size()) { + return; + } + bind_global_params(idx, m_subrequests[idx]); + bind_global_results(idx, m_subrequests[idx]); + }; + auto wait_and_clear = [](RqPtrs& rqs) { + for (auto&& r : rqs) { + r->wait(); + } + rqs.clear(); + }; + + if (do_async) { + std::size_t past_repl_id = 0u; + RqPtrs previous_requests; + + prepare(0); + for (std::size_t idx = 0; idx < m_num_submodels; idx++) { + auto& subr = m_subrequests[idx]; + if (!subr) { + prepare(idx + 1); + continue; + } + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + const auto this_repl_id = comp_model_desc.replaced_by.value_or(idx); + if (this_repl_id != past_repl_id) { + // For non-repeating blocks, the above value_or returns idx + // For repeating blocks, it returns the function group id + // If either is not equal to the past_repl_id, make a barrier here + wait_and_clear(previous_requests); + past_repl_id = this_repl_id; + } + subr->start_async(); + previous_requests.push_back(subr); + prepare(idx + 1); + } + wait_and_clear(previous_requests); + } else { + prepare(0); + for (std::size_t idx = 0; idx < m_num_submodels; idx++) { + auto& subr = m_subrequests[idx]; + if (!subr) { + prepare(idx + 1); + continue; + } + subr->start_async(); + 
prepare(idx + 1); + subr->wait(); + } + } // (async) +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp new file mode 100644 index 00000000000000..76b67571ec4c40 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +#include "base_sync_infer_request.hpp" + +namespace ov { +namespace npuw { + +class UnfoldInferRequest final : public IBaseInferRequest { +public: + explicit UnfoldInferRequest(const std::shared_ptr& compiled_model); + + //////////////////////////////////// + // implement IBaseInferRequest - nether of these are required here + // this hierarchy needs revew + void prepare_for_infer() override {} + bool valid_subrequest(std::size_t idx) const override; + void start_subrequest(std::size_t) override {} + void run_subrequest_for_success(std::size_t, bool&) override {} + void subscribe_subrequest(std::size_t, Completed cb) override {} + void complete_subrequest(std::size_t) override {} + void cancel_subrequest(std::size_t) override {} + bool supports_async_pipeline() const override { + return false; + } + void update_subrequest_links(std::size_t) override {} + +private: + void infer() override; +}; + +} // namespace npuw +} // namespace ov From 76cca6cebfdb9efe21f672960768483c6d0a4000 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Tue, 26 Nov 2024 20:22:27 +0000 Subject: [PATCH 2/3] [NPUW] LazyTensor refactoring (#27108) --- .../intel_npu/src/plugin/npuw/lazy_tensor.cpp | 398 ++++++++---------- .../intel_npu/src/plugin/npuw/lazy_tensor.hpp | 32 +- .../plugin/npuw/partitioning/partitioning.cpp | 27 +- .../npuw/partitioning/patterns/dcoff.cpp | 8 +- 4 files changed, 208 insertions(+), 257 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp index 8a0317a9f714e8..81521222ae6fae 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp @@ -4,41 +4,166 @@ #include "lazy_tensor.hpp" -using ov::npuw::weights::ConcatMeta; -using ov::npuw::weights::ConstPtr; +#include +#include +#include + +#include "logging.hpp" +#include "openvino/runtime/make_tensor.hpp" +#include "util.hpp" + using ov::npuw::weights::LazyTensor; -using ov::npuw::weights::OrigData; -using ov::npuw::weights::Transform; -using ov::npuw::weights::TransformType; -using ov::npuw::weights::UnpackMeta; namespace ov { namespace npuw { namespace weights { +namespace op { +struct Const { + std::shared_ptr node; + + std::size_t hash() const { + std::size_t seed = std::hash()(node->get_data_ptr()) + 0x9e3779b9; + seed ^= node->get_element_type().hash() + 0x9e3779b9; + for (const auto& dim : node->get_shape()) { + seed ^= std::hash()(dim) + 0x9e3779b9; + } + return seed; + } + bool operator==(const Const& other) const { + return (node->get_shape() == other.node->get_shape() && + node->get_element_type() == other.node->get_element_type() && + node->get_data_ptr() == other.node->get_data_ptr()); + } + ov::Tensor eval() const { + return ov::npuw::util::tensor_from_const(node); + } +}; +struct Concat { + std::vector tensors; + std::size_t axis; + + std::size_t hash() const { + std::size_t seed = std::hash()(axis) + 0x9e3779b9; + for (auto& lt : tensors) { + seed ^= lt.get_hash() + 
0x9e3779b9; + } + return seed; + } + bool operator==(const Concat& other) const { + return (axis == other.axis && tensors == other.tensors); + } + ov::Tensor eval() const { + std::vector to_concat; + for (const auto& lt : tensors) { + to_concat.push_back(lt.eval()); + } + return ov::npuw::util::concat(to_concat, axis); + } +}; + +struct Unpack { + LazyTensor w, z, s; + ov::element::Type type; + ov::Shape shape; + + std::size_t hash() const { + std::size_t seed = w.get_hash() + 0x9e3779b9; + seed ^= z.get_hash() + 0x9e3779b9; + seed ^= s.get_hash() + 0x9e3779b9; + seed ^= type.hash() + 0x9e3779b9; + for (const auto& dim : shape) { + seed ^= std::hash()(dim) + 0x9e3779b9; + } + return seed; + } + bool operator==(const Unpack& other) const { + return (type == other.type && shape == other.shape && w == other.w && z == other.z && s == other.s); + } + ov::Tensor eval() const { + const auto& gti = ov::get_tensor_impl; + const auto& tw = w.eval(); + const auto& tz = z.eval(); + const auto& ts = s.eval(); + NPUW_ASSERT(tw); + ov::Tensor dst(type, shape); + if (tw && tz && ts) { + ov::npuw::util::unpack(gti(tw), gti(tz), gti(ts), gti(dst)); + } else if (tw && ts) { + ov::npuw::util::unpack(gti(tw), gti(ts), gti(dst)); + } else { + NPUW_ASSERT(false && "Unsupported combination"); + } + return dst; + } +}; +struct Permute { + LazyTensor tensor; + std::vector axes; + + std::size_t hash() const { + std::size_t seed = tensor.get_hash() + 0x9e3779b9; + for (const auto& axis : axes) { + seed ^= std::hash()(axis) + 0x9e3779b9; + } + return seed; + } + bool operator==(const Permute& other) const { + return (axes == other.axes && tensor == other.tensor); + } + ov::Tensor eval() const { + return ov::npuw::util::permute(tensor.eval(), axes); + } +}; +struct Convert { + LazyTensor tensor; + ov::element::Type type; + + std::size_t hash() const { + std::size_t seed = type.hash() + 0x9e3779b9; + seed ^= tensor.get_hash() + 0x9e3779b9; + return seed; + } + bool operator==(const Convert& other) const { + return (type == other.type && tensor == other.tensor); + } + ov::Tensor eval() const { + NPUW_ASSERT(ov::element::f16 == type); + return ov::npuw::util::to_f16(tensor.eval()); + } +}; +} // namespace op + +using Transform = std::variant; struct LazyTensorImpl { public: LazyTensorImpl() = default; - LazyTensorImpl(const TransformType& type, const Transform& transform); - - bool operator==(const LazyTensorImpl& other) const; + explicit LazyTensorImpl(Transform&& t); ov::Tensor eval() const; - ov::Tensor get_orig_tensor() const; - + bool operator==(const LazyTensorImpl& other) const; std::size_t get_hash() const; - bool has_transformations() const; - - std::shared_ptr m_parent = nullptr; - std::pair m_transform; + Transform m_transform; std::size_t m_hash = 0; +}; + +} // namespace weights +} // namespace npuw +} // namespace ov + +using namespace ov::npuw::weights::op; +using ov::npuw::weights::LazyTensorImpl; +using ov::npuw::weights::Transform; - void* m_orig_data = nullptr; - ov::Shape m_orig_shape; - ov::element::Type m_orig_type; +// std::visit helper +template +struct overloaded : Ts... { + using Ts::operator()...; }; +template +overloaded(Ts...) 
-> overloaded; std::size_t LazyTensorImpl::get_hash() const { // Already calculated @@ -46,120 +171,23 @@ std::size_t LazyTensorImpl::get_hash() const { return m_hash; } - // Get parent's hash + // Get hash std::size_t seed = 0; - if (m_parent) { - seed = m_parent->get_hash(); - } else { - seed = std::hash()(m_orig_data) + 0x9e3779b9; - for (const auto& dim : m_orig_shape) { - seed ^= std::hash()(dim) + 0x9e3779b9; - } - seed ^= m_orig_type.hash() + 0x9e3779b9; - } - - // Combine with this hash - seed ^= std::hash()(static_cast(m_transform.first)) + 0x9e3779b9; - if (m_transform.first == TransformType::PERMUTE) { - const auto& axes = std::get>(m_transform.second); - for (const auto& axis : axes) { - seed ^= std::hash()(axis) + 0x9e3779b9; - } - } else if (m_transform.first == TransformType::CONCAT) { - const auto& axis = std::get(m_transform.second).second; - seed ^= std::hash()(axis) + 0x9e3779b9; - for (auto& lt : std::get(m_transform.second).first) { - seed ^= lt.get_hash() + 0x9e3779b9; - } - } else if (m_transform.first == TransformType::UNPACK) { - const auto& unpack_meta = std::get(m_transform.second); - seed ^= std::get<0>(unpack_meta).get_hash() + 0x9e3779b9; - seed ^= std::get<1>(unpack_meta).get_hash() + 0x9e3779b9; - seed ^= std::get<2>(unpack_meta).get_hash() + 0x9e3779b9; - for (const auto& dim : std::get<3>(unpack_meta)) { - seed ^= std::hash()(dim) + 0x9e3779b9; - } - seed ^= std::get<4>(unpack_meta).hash() + 0x9e3779b9; - } + std::visit(overloaded{[&seed](const auto& op) { + seed ^= op.hash(); + }}, + m_transform); return seed; } -} // namespace weights -} // namespace npuw -} // namespace ov - -using ov::npuw::weights::LazyTensorImpl; - -LazyTensorImpl::LazyTensorImpl(const TransformType& type, const Transform& transform) { - if (type == TransformType::THIS && std::holds_alternative(transform)) { - m_transform = std::make_pair(type, transform); - ov::Tensor tensor; - if (std::holds_alternative(std::get(transform))) { - tensor = ov::npuw::util::tensor_from_const(std::get(std::get(transform))); - } else { - tensor = std::get(std::get(transform)); - if (!tensor) { - // Don't set anything - return; - } - } - m_orig_data = tensor.data(); - m_orig_shape = tensor.get_shape(); - m_orig_type = tensor.get_element_type(); - } else if (type == TransformType::CONCAT && std::holds_alternative(transform)) { - m_transform = std::make_pair(type, transform); - } else if (type == TransformType::UNPACK && std::holds_alternative(transform)) { - m_transform = std::make_pair(type, transform); - } else { - NPUW_ASSERT(false); - } +LazyTensorImpl::LazyTensorImpl(Transform&& t) { + m_transform = std::move(t); m_hash = get_hash(); } bool LazyTensorImpl::operator==(const LazyTensorImpl& other) const { - if (m_hash != other.m_hash || m_orig_data != other.m_orig_data || m_orig_shape != other.m_orig_shape || - m_orig_type != other.m_orig_type || m_transform.first != other.m_transform.first) { - return false; - } - - switch (m_transform.first) { - case TransformType::THIS: - // everything is already compared above - skip - break; - case TransformType::CONVERT: - // everything is already compared above - skip - break; - case TransformType::PERMUTE: - if (std::get>(m_transform.second) != - std::get>(other.m_transform.second)) { - return false; - } - break; - case TransformType::CONCAT: - if (std::get(m_transform.second) != std::get(other.m_transform.second)) { - return false; - } - break; - case TransformType::UNPACK: - if (std::get(m_transform.second) != std::get(other.m_transform.second)) { - return 
false; - } - break; - default: - NPUW_ASSERT(false); - break; - } - - if ((m_parent && !other.m_parent) || (!m_parent && other.m_parent)) { - return false; - } - - if (m_parent && other.m_parent) { - return *m_parent.get() == *other.m_parent.get(); - } - - return true; + return m_hash == other.m_hash && m_transform == other.m_transform; } ov::Tensor LazyTensorImpl::eval() const { @@ -173,82 +201,37 @@ ov::Tensor LazyTensorImpl::eval() const { Perhaps it should be done after model compilation and not handled here. */ - // Process the initial tensor - either from Const or from Concat - if (!m_parent) { - if (m_transform.first == TransformType::THIS) { - return get_orig_tensor(); - } else if (m_transform.first == TransformType::CONCAT) { - std::vector to_concat; - for (const auto& lt : std::get(m_transform.second).first) { - // Sanity check - NPUW_ASSERT(!lt.has_transformations()); - to_concat.push_back(lt.get_orig_tensor()); - } - return ov::npuw::util::concat(to_concat, std::get(m_transform.second).second); - } else if (m_transform.first == TransformType::UNPACK) { - const auto& unpack_meta = std::get(m_transform.second); - const auto& cw = std::get<0>(unpack_meta); - const auto& cz = std::get<1>(unpack_meta); - const auto& cs = std::get<2>(unpack_meta); - const auto& shape = std::get<3>(unpack_meta); - const auto& type = std::get<4>(unpack_meta); - - // Note: unpacking done in-place since the original tensor is empty at this point - NPUW_ASSERT(!cw.has_transformations()); - NPUW_ASSERT(!cs.has_transformations()); - // FIXME: Ugly check concat case as well since cz might be not set - if (cz.has_transformations()) { - NPUW_ASSERT(false); - } - - const auto& gti = ov::get_tensor_impl; - const auto& tw = cw.get_orig_tensor(); - const auto& tz = cz.get_orig_tensor(); - const auto& ts = cs.get_orig_tensor(); - ov::Tensor dst(type, shape); - if (tw && tz && ts) { - ov::npuw::util::unpack(gti(tw), gti(tz), gti(ts), gti(dst)); - } else if (tw && ts) { - ov::npuw::util::unpack(gti(tw), gti(ts), gti(dst)); - } else { - NPUW_ASSERT(false && "Unsupported combination"); - } - return dst; - } else { - NPUW_ASSERT(false); - } - } - - // Process transformation - switch (m_transform.first) { - case TransformType::PERMUTE: - return ov::npuw::util::permute(m_parent->eval(), std::get>(m_transform.second)); - case TransformType::CONVERT: - return ov::npuw::util::to_f16(m_parent->eval()); - default: - NPUW_ASSERT(false); - } - - NPUW_ASSERT(false); - return ov::Tensor(); + ov::Tensor result = std::visit(overloaded{[](const auto& op) { + return op.eval(); + }}, + m_transform); + NPUW_ASSERT(result); + return result; } -ov::Tensor LazyTensorImpl::get_orig_tensor() const { - // Sanity check - NPUW_ASSERT(!has_transformations()); - if (std::holds_alternative(std::get(m_transform.second))) { - return ov::npuw::util::tensor_from_const(std::get(std::get(m_transform.second))); - } - return std::get(std::get(m_transform.second)); +LazyTensor::LazyTensor(const std::shared_ptr& const_ptr) + : m_impl(std::make_shared(op::Const{const_ptr})) {} +LazyTensor::LazyTensor(const std::vector& to_concat, const std::size_t axis) + : m_impl(std::make_shared(op::Concat{to_concat, axis})) {} +LazyTensor::LazyTensor(const LazyTensor& cw, + const LazyTensor& cz, + const LazyTensor& cs, + const ov::element::Type& type, + const ov::Shape& shape) + : m_impl(std::make_shared(op::Unpack{cw, cz, cs, type, shape})) {} + +LazyTensor LazyTensor::permute(const std::vector& axes) { + LazyTensor new_lt; + new_lt.m_impl = 
+    return new_lt;
 }
 
-bool LazyTensorImpl::has_transformations() const {
-    return m_transform.first != TransformType::THIS;
+LazyTensor LazyTensor::convert(const ov::element::Type& type) {
+    LazyTensor new_lt;
+    new_lt.m_impl = std::make_shared<LazyTensorImpl>(op::Convert{*this, type});
+    return new_lt;
 }
 
-LazyTensor::LazyTensor(const TransformType& type, const Transform& transform)
-    : m_impl(std::make_shared<LazyTensorImpl>(type, transform)) {}
-
 bool LazyTensor::operator==(const LazyTensor& other) const {
     return *m_impl.get() == *other.m_impl.get();
 }
@@ -257,37 +240,20 @@ bool LazyTensor::operator!=(const LazyTensor& other) const {
     return !(*m_impl.get() == *other.m_impl.get());
 }
 
-void LazyTensor::update(const TransformType& type, const Transform& transform) {
-    const auto& curr = m_impl;
-    auto new_lt = std::make_shared<LazyTensorImpl>();
-
-    new_lt->m_orig_data = curr->m_orig_data;
-    new_lt->m_orig_shape = curr->m_orig_shape;
-    new_lt->m_orig_type = curr->m_orig_type;
-
-    new_lt->m_transform = std::make_pair(type, transform);
-    new_lt->m_parent = curr;
-    new_lt->m_hash = new_lt->get_hash();
-
-    m_impl = new_lt;
-}
-
 ov::Tensor LazyTensor::eval() const {
+    if (!m_impl) {
+        return ov::Tensor();
+    }
     return m_impl->eval();
 }
 
-ov::Tensor LazyTensor::get_orig_tensor() const {
-    return m_impl->get_orig_tensor();
-}
-
 std::size_t LazyTensor::get_hash() const {
+    if (!m_impl) {
+        return 0;
+    }
     return m_impl->get_hash();
 }
 
 std::size_t LazyTensor::Hash::operator()(const LazyTensor& lt) const {
     return lt.get_hash();
 }
-
-bool LazyTensor::has_transformations() const {
-    return m_impl->has_transformations();
-}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
index 5cdeeba058e45f..365d9d636872b8 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
@@ -5,33 +5,17 @@
 #pragma once
 
 #include <memory>
-#include <tuple>
-#include <variant>
-#include <vector>
 
-#include "logging.hpp"
-#include "openvino/runtime/make_tensor.hpp"
+#include "openvino/op/constant.hpp"
 #include "openvino/runtime/tensor.hpp"
-#include "util.hpp"
 
 namespace ov {
 namespace npuw {
 namespace weights {
-
-enum class TransformType : int { THIS, PERMUTE, CONVERT, CONCAT, UNPACK };
-
 // Forward declaration
 class LazyTensor;
 struct LazyTensorImpl;
 
-using ConcatMeta = std::pair<std::vector<LazyTensor>, std::size_t>;
-using UnpackMeta = std::tuple<LazyTensor, LazyTensor, LazyTensor, ov::Shape, ov::element::Type>;
-using ConstPtr = std::shared_ptr<ov::op::v0::Constant>;
-using OrigData = std::variant<ConstPtr, ov::Tensor>;
-
-using Transform = std::variant<OrigData, std::vector<std::size_t>, std::monostate, ConcatMeta, UnpackMeta>;
-
 class LazyTensor {
 public:
     class Hash {
@@ -40,17 +24,23 @@ class LazyTensor {
     };
 
     LazyTensor() = default;
-    LazyTensor(const TransformType& type, const Transform& transform);
+    LazyTensor(const std::shared_ptr<ov::op::v0::Constant>& const_ptr);
+    LazyTensor(const std::vector<LazyTensor>& to_concat, const std::size_t axis);  // construct from concat
+    LazyTensor(const LazyTensor& cw,
+               const LazyTensor& cz,
+               const LazyTensor& cs,
+               const ov::element::Type& type,
+               const ov::Shape& shape);  // construct from unpack
+
+    LazyTensor permute(const std::vector<std::size_t>& axes);
+    LazyTensor convert(const ov::element::Type& type);
 
     bool operator==(const LazyTensor& other) const;
     bool operator!=(const LazyTensor& other) const;
 
-    void update(const TransformType& type, const Transform& transform);
     ov::Tensor eval() const;
-    ov::Tensor get_orig_tensor() const;
 
     std::size_t get_hash() const;
-    bool has_transformations() const;
 
 private:
     std::shared_ptr<LazyTensorImpl> m_impl = nullptr;
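The hunks above replace the old TransformType enum plus std::pair/std::tuple plumbing with one small struct per transformation, all stored in a std::variant, so hashing and evaluation become a single std::visit over whatever is stored. Below is a minimal standalone sketch of that idiom, not part of the patch: the Fill and Negate structs and the int payload are made-up stand-ins for the real op::Const, op::Permute and friends, and 0x9e3779b9 is the same seed-combining constant the ops above use.

// Standalone sketch (C++17), not part of the patch: a variant of op structs with a
// std::visit "overloaded" dispatcher, mirroring LazyTensorImpl::get_hash()/eval().
#include <cstddef>
#include <functional>
#include <iostream>
#include <variant>

template <class... Ts>
struct overloaded : Ts... {
    using Ts::operator()...;
};
template <class... Ts>
overloaded(Ts...) -> overloaded<Ts...>;

struct Fill {  // made-up stand-in for op::Const
    int value = 0;
    std::size_t hash() const {
        return std::hash<int>()(value) + 0x9e3779b9;
    }
    int eval() const {
        return value;
    }
};
struct Negate {  // made-up stand-in for a wrapping op like op::Convert
    int value = 0;
    std::size_t hash() const {
        std::size_t seed = std::hash<int>()(value) + 0x9e3779b9;
        seed ^= std::hash<int>()(1) + 0x9e3779b9;  // combine a second component, boost-style
        return seed;
    }
    int eval() const {
        return -value;
    }
};

using Transform = std::variant<Fill, Negate>;

int main() {
    Transform t = Negate{42};
    std::size_t seed = 0;
    // One generic visitor covers every alternative, as in the refactored code
    std::visit(overloaded{[&seed](const auto& op) {
                   seed ^= op.hash();
               }},
               t);
    std::cout << "hash=" << seed << " eval=" << std::visit([](const auto& op) { return op.eval(); }, t) << "\n";
    return 0;
}

A side effect of this layout, visible in the header diff above, is that LazyTensor's public surface shrinks to a few constructors plus permute()/convert(), which return new LazyTensor values instead of mutating one in place via update().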
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 99705fef30e8a8..2ff41be4c19f78 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -1525,8 +1525,7 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) {
 
             LOG_DEBUG("Register " << prod_output << " in the function closure");
             funcall._lazy_closure.push_back(
-                LazyTensor(TransformType::THIS,
-                           std::static_pointer_cast<ov::op::v0::Constant>(input_node)));  // (n)/1/i/c
+                LazyTensor(std::static_pointer_cast<ov::op::v0::Constant>(input_node)));  // (n)/1/i/c
         } else if (ov::op::util::is_parameter(input_node)) {
             LOG_DEBUG("Handling a Parameter input " << prod_output);
             LOG_BLOCK();
@@ -1695,8 +1694,7 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
                     LOG_DEBUG("Register " << prod_output << " in the function closure[" << param_idx
                                           << "] (via prototype " << proto_layer_name << ")");
                     funcall._lazy_closure[param_idx - function._param_offset] =
-                        LazyTensor(TransformType::THIS,
-                                   std::static_pointer_cast<ov::op::v0::Constant>(input_node));  // (t)/1/c
+                        LazyTensor(std::static_pointer_cast<ov::op::v0::Constant>(input_node));  // (t)/1/c
                 }
             }  // for (inputs)
         }      // for(nodes)
@@ -1765,7 +1763,7 @@ void Partitioner::optimize(const std::string& func_name) {
             auto closure_idx = param_idx - f._param_offset;
             ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
                 auto& funcall = func_group.refs[f_idx].get();
-                funcall._lazy_closure[closure_idx].update(TransformType::PERMUTE, p.second);
+                funcall._lazy_closure[closure_idx] = funcall._lazy_closure[closure_idx].permute(p.second);
             });
         }
     };
@@ -1775,7 +1773,7 @@ void Partitioner::optimize(const std::string& func_name) {
             auto closure_idx = param_idx - f._param_offset;
             ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
                 auto& funcall = func_group.refs[f_idx].get();
-                funcall._lazy_closure[closure_idx].update(TransformType::CONVERT, std::monostate{});
+                funcall._lazy_closure[closure_idx] = funcall._lazy_closure[closure_idx].convert(ov::element::f16);
             });
         }
     };
@@ -1830,15 +1828,12 @@ void Partitioner::optimize(const std::string& func_name) {
             std::vector<LazyTensor> to_concat;
             // Fill tensor vector
             for (auto&& cidx : to_concat_idx) {
-                // FIXME: Assuming here concat goes first and other transformations later.
-                // This allows to store ov::Tensor and ignore their potential history of transformations
-                NPUW_ASSERT(!funcall._lazy_closure[cidx].has_transformations());
                 to_concat.push_back(funcall._lazy_closure[cidx]);
             }
             // Note: we can ignore updating funcall._lazy_closure[cidx] here since those LazyTensors will be gone
             // and the new one added into the vector
             if (!to_concat.empty()) {
-                funcall._lazy_closure.push_back(LazyTensor(TransformType::CONCAT, std::make_pair(to_concat, axis)));
+                funcall._lazy_closure.push_back(LazyTensor(to_concat, axis));
                 // Some of the tensors might be in closure - preserve it's 1:1 idx mapping with _lazy_closure
                 funcall._closure.push_back(ov::Tensor());
             }
@@ -1865,17 +1860,11 @@ void Partitioner::optimize(const std::string& func_name) {
 
         ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
             auto& funcall = func_group.refs[f_idx].get();
-            // FIXME: assuming no transformations were applied to the tensor - since we are utilizing the original
-            // ov::Tensor below
             LazyTensor cw = funcall._lazy_closure[w_idx - f._param_offset];
-            LazyTensor cz = z_idx != -1 ? funcall._lazy_closure[z_idx - f._param_offset]
-                                        : LazyTensor(TransformType::THIS, ov::Tensor());
+            LazyTensor cz = z_idx != -1 ? funcall._lazy_closure[z_idx - f._param_offset] : LazyTensor();
             LazyTensor cs = funcall._lazy_closure[s_idx - f._param_offset];
-
-            // FIXME: currently there is an issue that we don't share such tensor between head and tail
             funcall._lazy_closure.push_back(
-                LazyTensor(TransformType::UNPACK,
-                           std::make_tuple(cw, cz, cs, p.first->get_shape(), p.first->get_element_type())));
+                LazyTensor(cw, cz, cs, p.first->get_element_type(), p.first->get_shape()));
             // Some of the tensors might be in closure - preserve it's 1:1 idx mapping with _lazy_closure
             funcall._closure.push_back(ov::Tensor());
         });
@@ -1899,7 +1888,7 @@ void Partitioner::optimize(const std::string& func_name) {
             // Based on our logic (when tensors get transferred from lazy tensors via bank
             // to the closure), this tensor should be non-empty to avoid this process.
             funcall.get()._closure.push_back(ov::Tensor(new_elem_type, new_shape));
-            funcall.get()._lazy_closure.push_back(LazyTensor(TransformType::THIS, ov::Tensor()));
+            funcall.get()._lazy_closure.push_back(LazyTensor());
         }
     }
 
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index 60f705a0c8f26c..641ee7690f4d34 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -94,8 +94,14 @@ ClosureRemap build_remap(const Function& fbody, const DCOFFParams& params_to) {
         } else if (ban_list.find(param) == ban_list.end()) {
             // If it's not in the ban list, it's an OK parameter and should be kept
             LOG_DEBUG("This is an OK parameter, will be kept");
-            m.weights_to_unpack.insert(i - fbody._param_offset);
             m.closure_remap.push_back(i - fbody._param_offset);
+
+            // Check if unpack is indeed required
+            const auto& type = param->get_element_type();
+            if (type == ov::element::i4 || type == ov::element::u4 || type == ov::element::i8 ||
+                type == ov::element::u8) {
+                m.weights_to_unpack.insert(i - fbody._param_offset);
+            }
         }
 
         // Process zero points for parameters

From cbb3760568891af2cb8c8938793cb1d96e51e11f Mon Sep 17 00:00:00 2001
From: Alexey Smirnov
Date: Fri, 29 Nov 2024 12:50:14 +0000
Subject: [PATCH 3/3] NPUW memory and L0 pipeline hotfixes

---
 .../intel_npu/src/plugin/npuw/compiled_model.cpp     |  5 +++--
 .../src/plugin/npuw/partitioning/patterns/dcoff.cpp  | 10 ++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 8770dee0d68eea..e3cffa91bc681a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -705,8 +705,9 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infe
     const auto num_submodels = m_compiled_submodels.size();
     for (std::size_t idx = 0u; idx < num_submodels; idx++) {
         const auto& comp_model_desc = m_compiled_submodels[idx];
-        if (!comp_model_desc.replaced_by.has_value()) {
-            // not a funcall, do nothing
+        if (!comp_model_desc.replaced_by.has_value() || comp_model_desc.forced_to_fcall) {
+            // not a funcall, do nothing, or a subgraph that was forced to funcall
+            // (a 1-call function) - skip
             continue;
         }
         const auto real_idx = comp_model_desc.replaced_by.value();
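The compiled_model.cpp hunk above tightens the pipeline-setup loop: a submodel gets a function pipeline slot only if it was replaced by a shared function body and was not forced into a single-call funcall. A hedged, standalone sketch of that filter follows; Desc is a made-up stand-in for the real submodel descriptor and models only the two fields the condition reads.

// Standalone sketch, not part of the patch: the skip rule from
// create_sync_infer_request(), applied to a toy descriptor list.
#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

struct Desc {
    std::optional<std::size_t> replaced_by;  // set when the subgraph calls a shared function body
    bool forced_to_fcall = false;            // a 1-call function: compiled as a funcall, but not pipelined
};

int main() {
    std::vector<Desc> submodels = {
        {std::nullopt, false},    // plain subgraph - skipped
        {std::size_t{1}, false},  // real funcall - gets set up
        {std::size_t{2}, true},   // forced 1-call funcall - also skipped after the hotfix
    };
    for (std::size_t idx = 0u; idx < submodels.size(); idx++) {
        const auto& desc = submodels[idx];
        if (!desc.replaced_by.has_value() || desc.forced_to_fcall) {
            continue;  // same early-out as in the hunk above
        }
        std::cout << "submodel " << idx << " uses function body " << desc.replaced_by.value() << "\n";
    }
    return 0;
}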
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index 641ee7690f4d34..61963e2aea5ca6 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -96,12 +96,10 @@ ClosureRemap build_remap(const Function& fbody, const DCOFFParams& params_to) {
             LOG_DEBUG("This is an OK parameter, will be kept");
             m.closure_remap.push_back(i - fbody._param_offset);
 
-            // Check if unpack is indeed required
-            const auto& type = param->get_element_type();
-            if (type == ov::element::i4 || type == ov::element::u4 || type == ov::element::i8 ||
-                type == ov::element::u8) {
-                m.weights_to_unpack.insert(i - fbody._param_offset);
-            }
+            // FIXME: type should be queried from a lazy tensor
+            // and compared against param->get_element_type()
+            // to decide 100%
+            m.weights_to_unpack.insert(i - fbody._param_offset);
         }
 
         // Process zero points for parameters
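For context on the dcoff.cpp change above: the gate that the first commit added (and this hotfix drops again, pending the FIXME about querying the type from a lazy tensor) reduces to a single predicate on the parameter's element type. The sketch below restates it standalone; needs_unpack is a hypothetical helper name, and the snippet assumes an OpenVINO development install for the ov::element types.

// Standalone sketch, not part of the patch: only sub-byte and 8-bit integer
// weights were routed through the unpack path by the removed check.
#include <openvino/core/type/element_type.hpp>

static bool needs_unpack(const ov::element::Type& type) {
    return type == ov::element::i4 || type == ov::element::u4 || type == ov::element::i8 ||
           type == ov::element::u8;
}

int main() {
    // i4 weights would have been registered in weights_to_unpack, f16 would not.
    return (needs_unpack(ov::element::i4) && !needs_unpack(ov::element::f16)) ? 0 : 1;
}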