From 3d8b5eb4e20793e2af707cdbe5f776bf41b2506e Mon Sep 17 00:00:00 2001 From: Dmitry Matveev Date: Thu, 21 Nov 2024 15:18:20 +0000 Subject: [PATCH 1/3] NPUW: Unfold infer requests (#27319) ### Details: - *item1* - *...* ### Tickets: - E-140517 --- .../src/al/include/intel_npu/config/npuw.hpp | 1 + .../intel_npu/npuw_private_properties.hpp | 8 + .../intel_npu/src/al/src/config/npuw.cpp | 1 + .../plugin/npuw/base_sync_infer_request.cpp | 291 ++++++++++++++++- .../plugin/npuw/base_sync_infer_request.hpp | 37 ++- .../src/plugin/npuw/compiled_model.cpp | 97 ++++-- .../src/plugin/npuw/compiled_model.hpp | 14 +- .../plugin/npuw/just_sync_infer_request.cpp | 308 ++---------------- .../plugin/npuw/just_sync_infer_request.hpp | 34 +- .../plugin/npuw/unfold_sync_infer_request.cpp | 140 ++++++++ .../plugin/npuw/unfold_sync_infer_request.hpp | 42 +++ 11 files changed, 621 insertions(+), 352 deletions(-) create mode 100644 src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp create mode 100644 src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 7b0dab3d16da3c..edd5b1367c217f 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -56,6 +56,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime); DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime); DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime); DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime); +DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime); DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime); DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime); DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index 67dce9621bfb4e..d7761979339eb5 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -279,6 +279,14 @@ static constexpr ov::Property parallel_compilation{"NPUW_PARALLEL_COMPILE" */ static constexpr ov::Property funcall_async{"NPUW_FUNCALL_ASYNC"}; +/** + * @brief + * Type: boolean + * Create individual infer requests for partitiongs, even repeating. + * Default value: false. 
+ */ +static constexpr ov::Property unfold_ireqs{"NPUW_UNFOLD_IREQS"}; + namespace accuracy { /** * @brief diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 6a519a0f754a32..a4478ba3c9dcd2 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -38,6 +38,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp index 216b1a35b4315c..77d000cb415de7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp @@ -7,6 +7,7 @@ #include "compiled_model.hpp" #include "intel_npu/config/npuw.hpp" #include "logging.hpp" +#include "openvino/core/parallel.hpp" #include "util.hpp" ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr& compiled_model) @@ -58,12 +59,8 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re LOG_INFO("- Trying next device..."); comp_model_desc.device_it++; can_try_again = m_npuw_model->compile_for_success(id); - if (can_try_again) { - if (recompiled) - *recompiled = true; - // Probably shouldn't be called all the time, but only if - // I/O submodel is affected - m_npuw_model->reset_io(); + if (can_try_again && recompiled) { + *recompiled = true; } } } // while(!new_ireq && can_try_again) @@ -178,6 +175,33 @@ void ov::npuw::IBaseInferRequest::check_tensors() const { return; } +std::vector> ov::npuw::IBaseInferRequest::query_state() const { + std::vector> variable_states = {}; + for (const auto& request : m_subrequests) { + if (!request) // optimized out + continue; + for (auto&& state : request->query_state()) { + if (!state._so) + state._so = request._so; + variable_states.emplace_back(state); + } + } + return variable_states; +} + +std::vector ov::npuw::IBaseInferRequest::get_profiling_info() const { + std::vector info; + for (size_t i = 0; i < m_subrequests.size(); ++i) { + if (!m_subrequests[i]) // optimized out + continue; + auto&& subreq_info = m_subrequests[i]->get_profiling_info(); + for (auto&& rec : subreq_info) + rec.node_name = std::string("subgraph") + std::to_string(i) + ": " + rec.node_name; + info.insert(info.end(), subreq_info.begin(), subreq_info.end()); + } + return info; +} + void ov::npuw::IBaseInferRequest::infer() { m_now_idx.reset(); prepare_for_infer(); @@ -209,6 +233,261 @@ void ov::npuw::IBaseInferRequest::infer() { m_now_idx.reset(); } +std::size_t ov::npuw::IBaseInferRequest::total_subrequests() const { + return m_subrequests.size(); +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocMem(const ov::element::Type type, + const ov::Shape& shape, + const std::string& device) { + if (device == "CPU" || ov::shape_size(shape) == 0) { + return ov::get_tensor_impl(ov::Tensor(type, shape)); + } + + auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; + auto remote_tensor = remote_ctx->create_host_tensor(type, shape); + return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocOut(const ov::Output& node, + const std::string& device) { + return allocMem(node.get_element_type(), node.get_shape(), device); +} + +void ov::npuw::IBaseInferRequest::alloc_io() { 
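Since NPUW_UNFOLD_IREQS is wired in as a regular NPUW option (the DEFINE_OPT and registerNPUWOptions entries above, plus the BIND entry further down in compiled_model.cpp), it can be passed through the ordinary compile-time configuration. A minimal usage sketch, not taken from this patch; the "NPU_USE_NPUW" enablement key and the model path are assumptions:

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");              // assumed model path
        ov::AnyMap cfg = {{"NPU_USE_NPUW", "YES"},               // assumed NPUW enablement key
                          {"NPUW_UNFOLD_IREQS", "YES"}};         // option added by this patch
        auto compiled = core.compile_model(model, "NPU", cfg);
        auto request = compiled.create_infer_request();          // per-subgraph requests are created up front
        request.infer();
        return 0;
    }

With unfolding requested, the actual request type is still decided in create_sync_infer_request() (see compiled_model.cpp below): if any subgraph is spatial or needs weight unpacking, the plugin logs a warning and falls back to the regular JustInferRequest.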
+ // Preallocate input tensors + LOG_INFO("Preallocating input tensors..."); + for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { + const auto& port = m_npuw_model->inputs()[i]; + ov::SoPtr allocated = allocOut(port, m_npuw_model->global_mem_device()); + m_input_tensors.push_back(allocated); + m_input_allocated.insert(allocated->data()); + m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true}; + } // for(inputs) + + // Preallocate output tensors + LOG_INFO("Preallocating output tensors..."); + for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { + LOG_BLOCK(); + const auto& port = m_npuw_model->outputs()[i]; + LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port); + + // FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom + const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i); + LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second); + + auto tensor = alloc_global_out(i); + m_output_tensors.push_back(tensor); + m_port_to_tensor[port] = TensorStorage{tensor, true}; + } +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) { + const auto& port = m_npuw_model->outputs().at(out_idx); + return allocOut(port, m_npuw_model->global_mem_device()); +} + +void ov::npuw::IBaseInferRequest::init_gio() { + // Build the parameter/result mapping + m_subrequests_gio.resize(m_subrequests.size()); + + // Parameters: stage 1... + for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { + const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i); + if (to_submodel != CompiledModel::NO_LINK) { + std::size_t sub_idx{}, in_idx{}; + std::tie(sub_idx, in_idx) = to_submodel; + m_subrequests_gio.at(sub_idx).global_params[i] = in_idx; + } + } // for(inputs) + + // Parameters: stage 2... + for (auto&& it : m_npuw_model->m_param_subscribers) { + const auto param_idx = it.first; + for (auto&& to_submodel : it.second) { + std::size_t sub_idx{}, in_idx{}; + std::tie(sub_idx, in_idx) = to_submodel; + m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx; + } + } + + // Results + for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { + std::size_t sub_idx{}, out_idx{}; + std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i); + m_subrequests_gio.at(sub_idx).global_results[i] = out_idx; + } +} + +void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request) { + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + + NPUW_ASSERT(comp_model_desc.replaced_by); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx]; + + // Bind extra parameters from the function's closure + // First, do easy things & delay heavy stuff + std::vector closure_unpack_required; + std::vector closure_copy_required; + + for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { + auto& closure = comp_model_desc.closure[cidx]; + const auto closure_param_id = comp_model_desc.param_base + cidx; + + if (m_npuw_model->is_gather_closure(idx, cidx)) { + // No need to set/copy the host_gather's closure tensor int + // the subrequest - it is just a dummy. host_gather writes + // to the right buffer directly. 
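The bookkeeping built by init_gio() above is easiest to see on a toy layout. A hypothetical sketch — the subgraph split and every index below are invented for illustration, and GlobalIO simply mirrors the struct added to base_sync_infer_request.hpp in this patch:

    #include <cstddef>
    #include <map>
    #include <vector>

    struct GlobalIO {                                        // mirrors IBaseInferRequest::GlobalIO
        std::map<std::size_t, std::size_t> global_params;    // global param idx  -> subrequest input idx
        std::map<std::size_t, std::size_t> global_results;   // global result idx -> subrequest output idx
    };

    int main() {
        // Toy model: three global Parameters and two global Results, split into two subgraphs.
        std::vector<GlobalIO> gio(2);
        gio[0].global_params  = {{0, 0}, {1, 1}};   // params 0,1 feed Subgraph[0] inputs 0,1
        gio[1].global_params  = {{2, 0}};           // param 2 feeds Subgraph[1] input 0
        gio[1].global_results = {{0, 0}, {1, 1}};   // results 0,1 come from Subgraph[1] outputs 0,1
        return 0;
    }

bind_global_params() and bind_global_results() then walk exactly these maps to set (or copy) the global tensors on the corresponding subrequest ports.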
+ continue; + } + + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + if (m_npuw_model->unpack_required(idx, cidx)) { + // Remember where the unpack is required + closure_unpack_required.push_back(cidx); + } else { + if (needs_copy(idx, cidx)) { + // Remember where copy is requried + closure_copy_required.push_back(cidx); + } else { + // Easy case, just set one to another + request->set_tensor(iport, ov::get_tensor_impl(closure)); + } + } + } // for(closure) + + // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){ + ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) { + auto cidx = closure_copy_required[j]; + auto& closure = comp_model_desc.closure[cidx]; + const auto closure_param_id = comp_model_desc.param_base + cidx; + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + auto clparam = request->get_tensor(iport); + ov::get_tensor_impl(closure)->copy_to(clparam._ptr); + }); + // }); // ms_to_run + + for (std::size_t j = 0; j != closure_unpack_required.size(); j++) { + // NB: No need to protect anything here as containers are all + // preallocated and we only access elements under particular (thread + // -local) indices. + auto cidx = closure_unpack_required[j]; + + // FIXME: zerops are stored with absolute indexing, this needs to be aligned + auto& closure = comp_model_desc.closure[cidx]; + + const auto closure_param_id = comp_model_desc.param_base + cidx; + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + auto clparam = request->get_tensor(iport); + + if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) { + // Unpacking this weight requires scaling with zero points... + ov::npuw::util::unpack(ov::get_tensor_impl(closure), + ov::get_tensor_impl(comp_model_desc.zerops[cidx]), + ov::get_tensor_impl(comp_model_desc.scales[cidx]), + clparam); + } else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) { + // Unpacking this weight requires scaling + ov::npuw::util::unpack(ov::get_tensor_impl(closure), + ov::get_tensor_impl(comp_model_desc.scales[cidx]), + clparam); + } else { + // Unpacking this weight doesn't require scaling + ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam); + } + } +} + +void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr request) { + LOG_DEBUG("Binding parameters for Subgraph[" << idx << "]"); + LOG_BLOCK(); + + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + const auto real_idx = comp_model_desc.replaced_by.value_or(idx); + + const bool do_copy = needs_copy(idx); + const auto& iodesc = m_subrequests_gio.at(idx); + + const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const bool is_spatial = proto_comp_model_desc.spatial.has_value(); + + // a list of ports to copy tensors, if needed: FROM -> TO + std::vector, ov::Output>> copy_list; + + // Check if the given subgraph's input is spatial + auto is_spatial_param = [&](std::size_t sub_in_idx) -> bool { + if (!is_spatial) { + return false; // Early return + } + auto& spatial = proto_comp_model_desc.spatial.value(); + return std::any_of(spatial.params.begin(), spatial.params.end(), [&](const auto& p) -> bool { + return p.idx == sub_in_idx; + }); + }; + + for (auto&& it : iodesc.global_params) { + std::size_t param_idx{}, sub_in_idx{}; + std::tie(param_idx, sub_in_idx) = it; + LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl); + + const auto& g_port = 
m_npuw_model->inputs()[param_idx]; + const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor; + const auto& s_port = request->get_inputs()[sub_in_idx]; + LOG_DEBUG("Processing " << g_port << " -> " << s_port << "..."); + LOG_BLOCK(); + if (!is_spatial_param(sub_in_idx)) { + // Input parameter is non-spatial, do normal handling + if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) { + LOG_DEBUG("Will be copied"); + copy_list.emplace_back(g_tnsr, s_port); + } else { + LOG_DEBUG("Will be set"); + request->set_tensor(s_port, g_tnsr); + } + } else { + // Register for future use + m_spatial_io[real_idx].inputs.at(sub_in_idx) = g_tnsr; + } + } + + LOG_DEBUG("Running copy..."); + ov::parallel_for(copy_list.size(), [&](std::size_t idx) { + auto& it = copy_list[idx]; + ov::SoPtr dst = request->get_tensor(it.second); + it.first->copy_to(dst._ptr); + }); + + // Run host-side gather, if required + if (comp_model_desc.host_gather.dst_idx != -1) { + const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx]; + const auto gather = request->get_tensor(gport); + + const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base]; + const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx]; + const auto lookup = request->get_tensor(lport); + ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather); + } + + LOG_DEBUG("Done"); +} + +void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr request) { + LOG_DEBUG("Binding results for Subgraph[" << idx << "]"); + LOG_BLOCK(); + + const auto& iodesc = m_subrequests_gio.at(idx); + for (auto&& it : iodesc.global_results) { + std::size_t result_idx{}, sub_out_idx{}; + std::tie(result_idx, sub_out_idx) = it; + const auto& g_port = m_npuw_model->outputs()[result_idx]; + const auto& s_port = request->get_outputs()[sub_out_idx]; + request->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor); + } + + LOG_DEBUG("Done"); +} + void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp index 6be64d676d6149..ae24dcfee11f9d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp @@ -19,8 +19,15 @@ namespace ov { namespace npuw { +using TensorPtr = ov::SoPtr; + class CompiledModel; +using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure + // This interface is provided to npuw::AsyncInferRequest to manage the // individual subrequests' execution class IBaseInferRequest : public ov::ISyncInferRequest { @@ -40,6 +47,10 @@ class IBaseInferRequest : public ov::ISyncInferRequest { void check_tensors() const override; + // Query APIs - some default implementations here + std::vector> query_state() const override; + std::vector get_profiling_info() const override; + using sptr = std::shared_ptr; using Completed = std::function; @@ -50,7 +61,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest { virtual void run_subrequest_for_success(std::size_t idx, bool& failover) = 0; virtual void complete_subrequest(std::size_t idx) = 0; virtual void cancel_subrequest(std::size_t 
idx) = 0; - virtual std::size_t total_subrequests() const = 0; + virtual std::size_t total_subrequests() const; virtual bool supports_async_pipeline() const = 0; protected: @@ -107,8 +118,32 @@ class IBaseInferRequest : public ov::ISyncInferRequest { }; std::vector m_spatial_io; + // This structure tracks how every individual subrequest + // access the model's top-level (global, public, etc) parameters + // and results. Again, is managed by subclasses + struct GlobalIO { + using map_t = std::map; + map_t global_params; // param idx -> input idx + map_t global_results; // result idx -> output idx + }; + std::vector m_subrequests_gio; + + // Tracks tensors we allocated on our own - to recognize and avoid copies + std::unordered_set m_input_allocated; + + // Common functionality - shared for subclasses const std::size_t m_num_submodels; + TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device); + TensorPtr allocOut(const ov::Output& node, const std::string& device); + virtual void alloc_io(); + virtual TensorPtr alloc_global_out(std::size_t out_idx); + + virtual void init_gio(); + void unpack_closure(std::size_t idx, RqPtr request); + virtual void bind_global_params(std::size_t idx, RqPtr request); + virtual void bind_global_results(std::size_t idx, RqPtr request); + void dump_input_tensors(std::size_t idx); void dump_output_tensors(std::size_t idx); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index b52dd40ea59364..8770dee0d68eea 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -21,6 +21,7 @@ #include "openvino/util/common_util.hpp" #include "partitioning/patterns/opt.hpp" #include "plugin.hpp" +#include "unfold_sync_infer_request.hpp" #include "util.hpp" // required for get_properties_per_device() @@ -442,9 +443,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } implement_properties(); - - m_finalized = true; - reset_io(); + report_io(); } void ov::npuw::CompiledModel::finalize_weights_bank() { @@ -570,19 +569,7 @@ void ov::npuw::CompiledModel::fill_empty_tensor_names(const std::shared_ptr ov::npuw::CompiledModel::create_just_sync_infer_request() { - auto this_sptr = std::static_pointer_cast(shared_from_this()); - return std::make_shared(this_sptr); -} - std::shared_ptr ov::npuw::CompiledModel::create_sync_infer_request() const { // Synchronous infer request implementation may vary based on the // selected strategy auto* non_const_this = const_cast(this); // because of const in API - return non_const_this->create_just_sync_infer_request(); + auto non_const_this_sptr = std::static_pointer_cast(non_const_this->shared_from_this()); + + auto no_spatial_unpack = [&]() { + const auto num_submodels = m_compiled_submodels.size(); + for (std::size_t idx = 0u; idx < num_submodels; idx++) { + const auto& comp_model_desc = m_compiled_submodels[idx]; + if (!comp_model_desc.replaced_by.has_value()) { + // not a funcall, do nothing + continue; + } + const auto real_idx = comp_model_desc.replaced_by.value(); + if (m_compiled_submodels[real_idx].spatial) { + LOG_WARN("Subgraph[" << idx << "] is a call to spatial function, unfold can't be done"); + return false; // Spatial graph + } + if (unpack_required(idx)) { + LOG_WARN("Subgraph[" << idx << "] requires unpack, unfold can't be done"); + return false; // Unpack required + } + } + return true; // no spatial & subgraphs 
requiring unpack found + }; + + std::shared_ptr result; + if (m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>() && no_spatial_unpack()) { + result.reset(new ov::npuw::UnfoldInferRequest(non_const_this_sptr)); + } else { + result.reset(new ov::npuw::JustInferRequest(non_const_this_sptr)); + } + NPUW_ASSERT(result); + return result; } std::shared_ptr ov::npuw::CompiledModel::create_infer_request() const { @@ -776,6 +788,46 @@ std::string ov::npuw::CompiledModel::submodel_device(const std::size_t idx) cons return *comp_subm_desc.device_it; } +bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx) const { + auto& comp_model_desc = m_compiled_submodels.at(idx); + for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { + if (unpack_required(idx, cidx)) { + return true; + } + } + return false; +} + +bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx, const std::size_t cidx) const { + if (is_gather_closure(idx, cidx)) { + return false; + } + + auto& comp_model_desc = m_compiled_submodels.at(idx); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_compiled_submodels.at(real_idx); + + auto& closure = comp_model_desc.closure.at(cidx); + const auto closure_param_id = comp_model_desc.param_base + cidx; + + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + return (closure.get_element_type() != iport.get_element_type()); +} + +bool ov::npuw::CompiledModel::is_gather_closure(const std::size_t idx, const std::size_t cidx) const { + auto& comp_model_desc = m_compiled_submodels.at(idx); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_compiled_submodels.at(real_idx); + + const auto closure_param_id = comp_model_desc.param_base + cidx; + + if (func_desc.host_gather.dst_idx != -1 && + static_cast(func_desc.host_gather.dst_idx) == closure_param_id) { + return true; + } + return false; +} + void ov::npuw::CompiledModel::log_device_dist() const { std::unordered_map stats_for_devices; execution_stats stats_for_optimized_out{0.f, 0ul}; @@ -934,6 +986,7 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE), BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE), BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC), + BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS), BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK), BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC), BIND(npuw::cache_dir, NPUW_CACHE_DIR), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 6199ac66c0c64e..ece1bc78fb5bf5 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -47,6 +47,7 @@ class CompiledModel : public ov::ICompiledModel { // FIXME: This class has many friends.. 
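The unpack_required() rule introduced above reduces to an element-type comparison between the stored closure tensor and the compiled input port it feeds. A hypothetical pairing, just to make the rule concrete — the i4/f16 combination is an assumption, not taken from the patch:

    #include <openvino/core/type/element_type.hpp>

    // True when the closure weight would need unpacking before it can be fed to
    // the port - which, per no_spatial_unpack() above, also rules out the
    // UnfoldInferRequest path for the whole model.
    bool needs_unpack(const ov::element::Type& closure_type, const ov::element::Type& port_type) {
        return closure_type != port_type;
    }

    int main() {
        const bool a = needs_unpack(ov::element::i4,  ov::element::f16);  // true  - packed weight
        const bool b = needs_unpack(ov::element::f16, ov::element::f16);  // false - can be set/copied as-is
        return (a && !b) ? 0 : 1;
    }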
friend class IBaseInferRequest; friend class JustInferRequest; + friend class UnfoldInferRequest; friend class MemAccessSim; friend class FuncMemMgr; @@ -57,28 +58,27 @@ class CompiledModel : public ov::ICompiledModel { void dump_on_fail(std::size_t id, const std::string& device_to_stry, const char* extra); - bool m_finalized = false; - void reset_io(); + void report_io() const; // This is used for removing too long output tensor names to fix some compilation issues + // NB: These two methods has nothing to do with this particular class and should be + // moved elsewhere void remove_long_output_names(const std::shared_ptr& model); void fill_empty_tensor_names(const std::shared_ptr& model); std::shared_ptr get_npuw_plugin() const; - - std::shared_ptr create_just_sync_infer_request(); std::shared_ptr create_sync_infer_request() const override; std::string submodel_device(const std::size_t idx) const; + bool is_gather_closure(const std::size_t idx, const std::size_t cidx) const; + bool unpack_required(const std::size_t idx) const; + bool unpack_required(const std::size_t idx, const std::size_t cidx) const; void log_device_dist() const; - void implement_properties(); void finalize_weights_bank(); - std::string global_mem_device() const; - std::string funcall_mem_device(const std::size_t idx) const; std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 0e0b96582a663c..8d1c7c4a30acde 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -310,69 +310,9 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrinputs().size(); i++) { - const auto& port = m_npuw_model->inputs()[i]; - ov::SoPtr allocated = allocOut(port, m_npuw_model->global_mem_device()); - m_input_tensors.push_back(allocated); - m_input_allocated.insert(allocated->data()); - m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true}; - } // for(inputs) - - // Preallocate output tensors - LOG_INFO("Preallocating output tensors..."); - for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { - LOG_BLOCK(); - const auto& port = m_npuw_model->outputs()[i]; - LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port); - - // FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom - const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i); - - LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second); - auto funcall_result_iter = m_funcall_result.find(from_submodel); - - const auto& tensor = - funcall_result_iter != m_funcall_result.end() - ? funcall_result_iter->second // Function calls have their tensors allocated, so just use one - : allocOut(port, m_npuw_model->global_mem_device()); - - m_output_tensors.push_back(tensor); - m_port_to_tensor[port] = TensorStorage{tensor, true}; - } + alloc_io(); connect_subrequests(); - - // Build the parameter/result mapping {{{ - m_subrequests_gio.resize(m_subrequests.size()); - - // Parameters: stage 1... 
- for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { - const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i); - if (to_submodel != CompiledModel::NO_LINK) { - std::size_t sub_idx{}, in_idx{}; - std::tie(sub_idx, in_idx) = to_submodel; - m_subrequests_gio.at(sub_idx).global_params[i] = in_idx; - } - } // for(inputs) - - // Parameters: stage 2... - for (auto&& it : m_npuw_model->m_param_subscribers) { - const auto param_idx = it.first; - for (auto&& to_submodel : it.second) { - std::size_t sub_idx{}, in_idx{}; - std::tie(sub_idx, in_idx) = to_submodel; - m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx; - } - } - - // Results - for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { - std::size_t sub_idx{}, out_idx{}; - std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i); - m_subrequests_gio.at(sub_idx).global_results[i] = out_idx; - } - // }}} + init_gio(); for (size_t i = 0; i < m_num_submodels; i++) { LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]..."); @@ -413,6 +353,15 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_outputs_to_submodels_outputs.at(out_idx); + auto funcall_result_iter = m_funcall_result.find(from_submodel); + if (funcall_result_iter != m_funcall_result.end()) { + return funcall_result_iter->second; + } + return IBaseInferRequest::alloc_global_out(out_idx); +} + void ov::npuw::JustInferRequest::connect_subrequests() { LOG_INFO("Connecting subrequests..."); LOG_BLOCK(); @@ -478,33 +427,6 @@ void ov::npuw::JustInferRequest::connect_subrequests() { LOG_INFO("Done"); } -std::vector> ov::npuw::JustInferRequest::query_state() const { - std::vector> variable_states = {}; - for (const auto& request : m_subrequests) { - if (!request) // optimized out - continue; - for (auto&& state : request->query_state()) { - if (!state._so) - state._so = request._so; - variable_states.emplace_back(state); - } - } - return variable_states; -} - -std::vector ov::npuw::JustInferRequest::get_profiling_info() const { - std::vector info; - for (size_t i = 0; i < m_subrequests.size(); ++i) { - if (!m_subrequests[i]) // optimized out - continue; - auto&& subreq_info = m_subrequests[i]->get_profiling_info(); - for (auto&& rec : subreq_info) - rec.node_name = std::string("subgraph") + std::to_string(i) + ": " + rec.node_name; - info.insert(info.end(), subreq_info.begin(), subreq_info.end()); - } - return info; -} - void ov::npuw::JustInferRequest::prepare_for_infer() { LOG_DEBUG("Preparing to infer..."); LOG_BLOCK(); @@ -542,118 +464,36 @@ void ov::npuw::JustInferRequest::start_subrequest(std::size_t idx) { } void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) { - LOG_DEBUG("Binding parameters for Subgraph[" << idx << "]"); - LOG_BLOCK(); - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; const auto real_idx = comp_model_desc.replaced_by.value_or(idx); - const bool do_copy = needs_copy(idx); - const auto& iodesc = m_subrequests_gio.at(idx); - - const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - const bool is_spatial = proto_comp_model_desc.spatial.has_value(); - - // a list of ports to copy tensors, if needed: FROM -> TO - std::vector, ov::Output>> copy_list; - // pick which subrequest we actually work on here - auto subr = [&]() { - if (now_idx() && real_idx == real(now_idx().value()) && is_pipelined(now_idx().value())) { - LOG_DEBUG("Accessing the pipeline subrequest"); - // The real index of request we need 
to prepare IS - // the same request which executes now AND - // function_pipelining enabled - select the reserve request. - NPUW_ASSERT(m_funcall_pipeline[real_idx].subrequest); - return m_funcall_pipeline[real_idx].subrequest; - } + if (now_idx() && real_idx == real(now_idx().value()) && is_pipelined(now_idx().value())) { + LOG_DEBUG("Accessing the pipeline subrequest"); + // The real index of request we need to prepare IS + // the same request which executes now AND + // function_pipelining enabled - select the reserve request. + NPUW_ASSERT(m_funcall_pipeline[real_idx].subrequest); + bind_global_params(idx, m_funcall_pipeline[real_idx].subrequest); + } else { // Otherwise: Just a return a subrequest which is in place. // If it is a function call and we have function pipelining ON, // it is still the right subrequest we can use. LOG_DEBUG("Accessing the primary subrequest"); - return m_subrequests[real_idx]; - }(); - - // Check if the given subgraph's input is spatial - auto is_spatial_param = [&](std::size_t sub_in_idx) -> bool { - if (!is_spatial) { - return false; // Early return - } - auto& spatial = proto_comp_model_desc.spatial.value(); - return std::any_of(spatial.params.begin(), spatial.params.end(), [&](const auto& p) -> bool { - return p.idx == sub_in_idx; - }); - }; - - for (auto&& it : iodesc.global_params) { - std::size_t param_idx{}, sub_in_idx{}; - std::tie(param_idx, sub_in_idx) = it; - LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl); - - const auto& g_port = m_npuw_model->inputs()[param_idx]; - const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor; - const auto& s_port = subr->get_inputs()[sub_in_idx]; - LOG_DEBUG("Processing " << g_port << " -> " << s_port << "..."); - LOG_BLOCK(); - if (!is_spatial_param(sub_in_idx)) { - // Input parameter is non-spatial, do normal handling - if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) { - LOG_DEBUG("Will be copied"); - copy_list.emplace_back(g_tnsr, s_port); - } else { - LOG_DEBUG("Will be set"); - subr->set_tensor(s_port, g_tnsr); - } - } else { - // Register for future use - m_spatial_io[real_idx].inputs.at(sub_in_idx) = g_tnsr; - } + bind_global_params(idx, m_subrequests[real_idx]); } - - LOG_DEBUG("Running copy..."); - ov::parallel_for(copy_list.size(), [&](std::size_t idx) { - auto& it = copy_list[idx]; - ov::SoPtr dst = subr->get_tensor(it.second); - it.first->copy_to(dst._ptr); - }); - - // Run host-side gather, if required - if (comp_model_desc.host_gather.dst_idx != -1) { - const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx]; - const auto gather = subr->get_tensor(gport); - - const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base]; - const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx]; - const auto lookup = subr->get_tensor(lport); - ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather); - } - - LOG_DEBUG("Done"); } void ov::npuw::JustInferRequest::bind_global_results(std::size_t idx) { - LOG_DEBUG("Binding results for Subgraph[" << idx << "]"); - LOG_BLOCK(); - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; if (comp_model_desc.replaced_by) { // Don't do here - function call will take the right tensor // itself. 
Note it may be implemented more efficently than now // (and in some cases, the tensor can be pre-set) - LOG_DEBUG("Skipping this too now - function will do it for itself"); + LOG_DEBUG("Skipping bind_glo - function will do it for itself"); return; } - - const auto& iodesc = m_subrequests_gio.at(idx); - for (auto&& it : iodesc.global_results) { - std::size_t result_idx{}, sub_out_idx{}; - std::tie(result_idx, sub_out_idx) = it; - const auto& g_port = m_npuw_model->outputs()[result_idx]; - const auto& s_port = m_subrequests[idx]->get_outputs()[sub_out_idx]; - m_subrequests[idx]->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor); - } - - LOG_DEBUG("Done"); + IBaseInferRequest::bind_global_results(idx, m_subrequests[idx]); } void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { @@ -737,88 +577,6 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { LOG_DEBUG("Done"); } -void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - - NPUW_ASSERT(comp_model_desc.replaced_by); - const auto real_idx = comp_model_desc.replaced_by.value(); - auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx]; - - // Bind extra parameters from the function's closure - // First, do easy things & delay heavy stuff - std::vector closure_unpack_required; - std::vector closure_copy_required; - - for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { - auto& closure = comp_model_desc.closure[cidx]; - - const auto closure_param_id = comp_model_desc.param_base + cidx; - - if (func_desc.host_gather.dst_idx != -1 && - static_cast(func_desc.host_gather.dst_idx) == closure_param_id) { - // No need to set/copy the host_gather's closure tensor int - // the subrequest - it is just a dummy. host_gather writes - // to the right buffer directly. - continue; - } - - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - if (closure.get_element_type() != iport.get_element_type()) { - // Remember where the unpack is required - closure_unpack_required.push_back(cidx); - } else { - if (needs_copy(idx, cidx)) { - // Remember where copy is requried - closure_copy_required.push_back(cidx); - } else { - // Easy case, just set one to another - request->set_tensor(iport, ov::get_tensor_impl(closure)); - } - } - } // for(closure) - - // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){ - ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) { - auto cidx = closure_copy_required[j]; - auto& closure = comp_model_desc.closure[cidx]; - const auto closure_param_id = comp_model_desc.param_base + cidx; - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - auto clparam = request->get_tensor(iport); - ov::get_tensor_impl(closure)->copy_to(clparam._ptr); - }); - // }); // ms_to_run - - for (std::size_t j = 0; j != closure_unpack_required.size(); j++) { - // NB: No need to protect anything here as containers are all - // preallocated and we only access elements under particular (thread - // -local) indices. 
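The reserve-request selection in bind_global_parameters() above is the binding half of the function-pipelining scheme: while one request of a repeated function executes, its twin is being prepared for the next call. A generic double-buffering sketch of that idea only — the names prepare/run stand in for the bind/infer steps and this is not the plugin's code:

    #include <cstddef>
    #include <iostream>

    struct Req {                        // stand-in for an infer request
        void prepare(std::size_t call) { std::cout << "bind inputs for call " << call << "\n"; }
        void run()                     { std::cout << "infer\n"; }
    };

    int main() {
        const std::size_t n_calls = 4;
        Req reqs[2];                    // primary + reserve request for one repeated function
        std::size_t cur = 0;

        reqs[cur].prepare(0);
        for (std::size_t call = 0; call < n_calls; ++call) {
            if (call + 1 < n_calls) {
                reqs[cur ^ 1].prepare(call + 1);   // bind the next call while this one (conceptually) runs
            }
            reqs[cur].run();
            cur ^= 1;                              // swap primary and reserve
        }
        return 0;
    }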
- auto cidx = closure_unpack_required[j]; - - // FIXME: zerops are stored with absolute indexing, this needs to be aligned - auto& closure = comp_model_desc.closure[cidx]; - - const auto closure_param_id = comp_model_desc.param_base + cidx; - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - auto clparam = request->get_tensor(iport); - - if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) { - // Unpacking this weight requires scaling with zero points... - ov::npuw::util::unpack(ov::get_tensor_impl(closure), - ov::get_tensor_impl(comp_model_desc.zerops[cidx]), - ov::get_tensor_impl(comp_model_desc.scales[cidx]), - clparam); - } else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) { - // Unpacking this weight requires scaling - ov::npuw::util::unpack(ov::get_tensor_impl(closure), - ov::get_tensor_impl(comp_model_desc.scales[cidx]), - clparam); - } else { - // Unpacking this weight doesn't require scaling - ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam); - } - } -} - void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) { auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; auto real_idx = comp_model_desc.replaced_by.value_or(idx); @@ -1110,24 +868,6 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool } // if (replaced_by) } -ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type, - const ov::Shape& shape, - const std::string& device) { - if (device == "CPU" || ov::shape_size(shape) == 0) { - return ov::get_tensor_impl(ov::Tensor(type, shape)); - } - - // Protect access to shared context(s) - at least among infer requests - auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; - auto remote_tensor = remote_ctx->create_host_tensor(type, shape); - return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); -} - -ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output& node, - const std::string& device) { - return allocMem(node.get_element_type(), node.get_shape(), device); -} - void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) { get_real_subrequest(idx)->set_callback(std::move(cb)); } @@ -1140,10 +880,6 @@ void ov::npuw::JustInferRequest::cancel_subrequest(std::size_t idx) { m_subrequests[idx]->cancel(); } -std::size_t ov::npuw::JustInferRequest::total_subrequests() const { - return m_subrequests.size(); -} - bool ov::npuw::JustInferRequest::supports_async_pipeline() const { return false; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index d219f170a8e6bb..a935220b4b8943 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -23,13 +23,6 @@ namespace npuw { class CompiledModel; class AsyncInferRequest; -using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure - -using TensorPtr = ov::SoPtr; - class MemAccessSim { public: explicit MemAccessSim(const std::shared_ptr& compiled_model); @@ -77,11 +70,7 @@ class JustInferRequest final : public IBaseInferRequest { public: explicit JustInferRequest(const std::shared_ptr& compiled_model); - // Query APIs - std::vector> query_state() const override; - std::vector get_profiling_info() const override; - -private: +protected: 
//////////////////////////////////// // implement IBaseInferRequest void prepare_for_infer() override; @@ -91,11 +80,11 @@ class JustInferRequest final : public IBaseInferRequest { void subscribe_subrequest(std::size_t idx, Completed cb) override; void complete_subrequest(std::size_t idx) override; void cancel_subrequest(std::size_t idx) override; - std::size_t total_subrequests() const override; bool supports_async_pipeline() const override; - void update_subrequest_links(std::size_t idx) override; + TensorPtr alloc_global_out(std::size_t out_idx) override; + //////////////////////////////////// // now own API @@ -104,9 +93,9 @@ class JustInferRequest final : public IBaseInferRequest { void bind_global_parameters(std::size_t idx); void bind_global_results(std::size_t idx); + using IBaseInferRequest::bind_global_results; void function_prologue(std::size_t idx); - void unpack_closure(std::size_t idx, RqPtr request); void unsafe_during(std::size_t real_idx, const std::function& f); void unsafe_infer(std::size_t real_idx); @@ -115,9 +104,6 @@ class JustInferRequest final : public IBaseInferRequest { void connect_subrequests(); void recreate_subrequests(std::size_t idx); - TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device); - TensorPtr allocOut(const ov::Output& node, const std::string& device); - FuncMemMgr m_func_mem_mgr; // Owns memory std::map m_funcall_result; // Provides a convenient link @@ -139,18 +125,6 @@ class JustInferRequest final : public IBaseInferRequest { // initialized. std::vector m_funcall_pipeline; - // This structure tracks how every individual subrequest - // access the model's top-level (global, public, etc) parameters - // and results - struct GlobalIO { - using map_t = std::map; - map_t global_params; // param idx -> input idx - map_t global_results; // result idx -> output idx - }; - std::vector m_subrequests_gio; - - std::unordered_set m_input_allocated; - // Represents spatial run-time info runtime::spatial::Selector::Ptr m_spatial_selector; diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp new file mode 100644 index 00000000000000..90eb62dcc0a8e3 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "unfold_sync_infer_request.hpp" + +#include "compiled_model.hpp" +#include "logging.hpp" +#include "openvino/core/parallel.hpp" + +ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr& compiled_model) + : ov::npuw::IBaseInferRequest(compiled_model) { + // Create infer requests + // Preallocate funcall tensors & substitute function call requests + for (std::size_t i = 0; i < m_num_submodels; i++) { + LOG_INFO("Creating infer request for Subgraph[" << i << "]..."); + LOG_BLOCK(); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i]; + + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + // no model & no funcall - optimized out, do nothing + LOG_INFO("OPTIMIZED OUT"); + continue; + } + + if (comp_model_desc.replaced_by) { + // Pre-allocate output tensors for this function call + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (proto_comp_model_desc.spatial) { + NPUW_ASSERT(false && "Spatial is not supported in unfold"); + } + } // 
if(replaced_by) + + const auto real_idx = comp_model_desc.replaced_by.value_or(i); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + // NB: UnfoldInferRequest is _NOT_ fail-safe! Fail means fail here + m_subrequests[i] = proto_comp_model_desc.compiled_model->create_infer_request(); + m_subrequest_devices[i] = *proto_comp_model_desc.device_it; + LOG_INFO("DONE"); + } // for(submodels) + + alloc_io(); + + LOG_INFO("Connecting subrequests..."); + LOG_BLOCK(); + for (const auto& kvp : m_npuw_model->m_submodels_input_to_prev_output) { + const auto& subm_idx_to = kvp.first.first; + const auto& port_idx_to = kvp.first.second; + const auto& subm_idx_from = kvp.second.first; + const auto& port_idx_from = kvp.second.second; + + LOG_DEBUG("Subgraph[" << subm_idx_from << "]/" << port_idx_from << " --> " + << "Subgraph[" << subm_idx_to << "]/" << port_idx_to); + NPUW_ASSERT(m_subrequests[subm_idx_from]); // prod request is created + NPUW_ASSERT(m_subrequests[subm_idx_to]); // cons request is created + NPUW_ASSERT(m_subrequests[subm_idx_from]._ptr != m_subrequests[subm_idx_to]._ptr); + + const auto& iport = m_subrequests[subm_idx_to]->get_compiled_model()->inputs()[port_idx_to]; + const auto& oport = m_subrequests[subm_idx_from]->get_compiled_model()->outputs()[port_idx_from]; + const auto& tensor = m_subrequests[subm_idx_from]->get_tensor(oport); + LOG_DEBUG("Set Subgraph[" << subm_idx_to << "]/" << iport << " to Subgraph[" << subm_idx_from << "]/" << oport); + m_subrequests[subm_idx_to]->set_tensor(iport, tensor); + } // for(map) + LOG_INFO("Done"); + + init_gio(); + + for (size_t i = 0; i < m_num_submodels; i++) { + LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]..."); + LOG_BLOCK(); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i]; + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + continue; // Optimized out + } + unpack_closure(i, m_subrequests[i]); + LOG_VERB("Done"); + } +} + +bool ov::npuw::UnfoldInferRequest::valid_subrequest(std::size_t idx) const { + return m_subrequests.at(idx) != nullptr; +} + +void ov::npuw::UnfoldInferRequest::infer() { + const bool do_async = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>(); + + auto prepare = [&](std::size_t idx) { + if (idx >= m_subrequests.size()) { + return; + } + bind_global_params(idx, m_subrequests[idx]); + bind_global_results(idx, m_subrequests[idx]); + }; + auto wait_and_clear = [](RqPtrs& rqs) { + for (auto&& r : rqs) { + r->wait(); + } + rqs.clear(); + }; + + if (do_async) { + std::size_t past_repl_id = 0u; + RqPtrs previous_requests; + + prepare(0); + for (std::size_t idx = 0; idx < m_num_submodels; idx++) { + auto& subr = m_subrequests[idx]; + if (!subr) { + prepare(idx + 1); + continue; + } + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + const auto this_repl_id = comp_model_desc.replaced_by.value_or(idx); + if (this_repl_id != past_repl_id) { + // For non-repeating blocks, the above value_or returns idx + // For repeating blocks, it returns the function group id + // If either is not equal to the past_repl_id, make a barrier here + wait_and_clear(previous_requests); + past_repl_id = this_repl_id; + } + subr->start_async(); + previous_requests.push_back(subr); + prepare(idx + 1); + } + wait_and_clear(previous_requests); + } else { + prepare(0); + for (std::size_t idx = 0; idx < m_num_submodels; idx++) { + auto& subr = m_subrequests[idx]; + if (!subr) { + prepare(idx + 1); + continue; + } + subr->start_async(); + 
prepare(idx + 1); + subr->wait(); + } + } // (async) +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp new file mode 100644 index 00000000000000..76b67571ec4c40 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +#include "base_sync_infer_request.hpp" + +namespace ov { +namespace npuw { + +class UnfoldInferRequest final : public IBaseInferRequest { +public: + explicit UnfoldInferRequest(const std::shared_ptr& compiled_model); + + //////////////////////////////////// + // implement IBaseInferRequest - nether of these are required here + // this hierarchy needs revew + void prepare_for_infer() override {} + bool valid_subrequest(std::size_t idx) const override; + void start_subrequest(std::size_t) override {} + void run_subrequest_for_success(std::size_t, bool&) override {} + void subscribe_subrequest(std::size_t, Completed cb) override {} + void complete_subrequest(std::size_t) override {} + void cancel_subrequest(std::size_t) override {} + bool supports_async_pipeline() const override { + return false; + } + void update_subrequest_links(std::size_t) override {} + +private: + void infer() override; +}; + +} // namespace npuw +} // namespace ov From 76cca6cebfdb9efe21f672960768483c6d0a4000 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Tue, 26 Nov 2024 20:22:27 +0000 Subject: [PATCH 2/3] [NPUW] LazyTensor refactoring (#27108) --- .../intel_npu/src/plugin/npuw/lazy_tensor.cpp | 398 ++++++++---------- .../intel_npu/src/plugin/npuw/lazy_tensor.hpp | 32 +- .../plugin/npuw/partitioning/partitioning.cpp | 27 +- .../npuw/partitioning/patterns/dcoff.cpp | 8 +- 4 files changed, 208 insertions(+), 257 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp index 8a0317a9f714e8..81521222ae6fae 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp @@ -4,41 +4,166 @@ #include "lazy_tensor.hpp" -using ov::npuw::weights::ConcatMeta; -using ov::npuw::weights::ConstPtr; +#include +#include +#include + +#include "logging.hpp" +#include "openvino/runtime/make_tensor.hpp" +#include "util.hpp" + using ov::npuw::weights::LazyTensor; -using ov::npuw::weights::OrigData; -using ov::npuw::weights::Transform; -using ov::npuw::weights::TransformType; -using ov::npuw::weights::UnpackMeta; namespace ov { namespace npuw { namespace weights { +namespace op { +struct Const { + std::shared_ptr node; + + std::size_t hash() const { + std::size_t seed = std::hash()(node->get_data_ptr()) + 0x9e3779b9; + seed ^= node->get_element_type().hash() + 0x9e3779b9; + for (const auto& dim : node->get_shape()) { + seed ^= std::hash()(dim) + 0x9e3779b9; + } + return seed; + } + bool operator==(const Const& other) const { + return (node->get_shape() == other.node->get_shape() && + node->get_element_type() == other.node->get_element_type() && + node->get_data_ptr() == other.node->get_data_ptr()); + } + ov::Tensor eval() const { + return ov::npuw::util::tensor_from_const(node); + } +}; +struct Concat { + std::vector tensors; + std::size_t axis; + + std::size_t hash() const { + std::size_t seed = std::hash()(axis) + 0x9e3779b9; + for (auto& lt : tensors) { + seed ^= lt.get_hash() + 
0x9e3779b9; + } + return seed; + } + bool operator==(const Concat& other) const { + return (axis == other.axis && tensors == other.tensors); + } + ov::Tensor eval() const { + std::vector to_concat; + for (const auto& lt : tensors) { + to_concat.push_back(lt.eval()); + } + return ov::npuw::util::concat(to_concat, axis); + } +}; + +struct Unpack { + LazyTensor w, z, s; + ov::element::Type type; + ov::Shape shape; + + std::size_t hash() const { + std::size_t seed = w.get_hash() + 0x9e3779b9; + seed ^= z.get_hash() + 0x9e3779b9; + seed ^= s.get_hash() + 0x9e3779b9; + seed ^= type.hash() + 0x9e3779b9; + for (const auto& dim : shape) { + seed ^= std::hash()(dim) + 0x9e3779b9; + } + return seed; + } + bool operator==(const Unpack& other) const { + return (type == other.type && shape == other.shape && w == other.w && z == other.z && s == other.s); + } + ov::Tensor eval() const { + const auto& gti = ov::get_tensor_impl; + const auto& tw = w.eval(); + const auto& tz = z.eval(); + const auto& ts = s.eval(); + NPUW_ASSERT(tw); + ov::Tensor dst(type, shape); + if (tw && tz && ts) { + ov::npuw::util::unpack(gti(tw), gti(tz), gti(ts), gti(dst)); + } else if (tw && ts) { + ov::npuw::util::unpack(gti(tw), gti(ts), gti(dst)); + } else { + NPUW_ASSERT(false && "Unsupported combination"); + } + return dst; + } +}; +struct Permute { + LazyTensor tensor; + std::vector axes; + + std::size_t hash() const { + std::size_t seed = tensor.get_hash() + 0x9e3779b9; + for (const auto& axis : axes) { + seed ^= std::hash()(axis) + 0x9e3779b9; + } + return seed; + } + bool operator==(const Permute& other) const { + return (axes == other.axes && tensor == other.tensor); + } + ov::Tensor eval() const { + return ov::npuw::util::permute(tensor.eval(), axes); + } +}; +struct Convert { + LazyTensor tensor; + ov::element::Type type; + + std::size_t hash() const { + std::size_t seed = type.hash() + 0x9e3779b9; + seed ^= tensor.get_hash() + 0x9e3779b9; + return seed; + } + bool operator==(const Convert& other) const { + return (type == other.type && tensor == other.tensor); + } + ov::Tensor eval() const { + NPUW_ASSERT(ov::element::f16 == type); + return ov::npuw::util::to_f16(tensor.eval()); + } +}; +} // namespace op + +using Transform = std::variant; struct LazyTensorImpl { public: LazyTensorImpl() = default; - LazyTensorImpl(const TransformType& type, const Transform& transform); - - bool operator==(const LazyTensorImpl& other) const; + explicit LazyTensorImpl(Transform&& t); ov::Tensor eval() const; - ov::Tensor get_orig_tensor() const; - + bool operator==(const LazyTensorImpl& other) const; std::size_t get_hash() const; - bool has_transformations() const; - - std::shared_ptr m_parent = nullptr; - std::pair m_transform; + Transform m_transform; std::size_t m_hash = 0; +}; + +} // namespace weights +} // namespace npuw +} // namespace ov + +using namespace ov::npuw::weights::op; +using ov::npuw::weights::LazyTensorImpl; +using ov::npuw::weights::Transform; - void* m_orig_data = nullptr; - ov::Shape m_orig_shape; - ov::element::Type m_orig_type; +// std::visit helper +template +struct overloaded : Ts... { + using Ts::operator()...; }; +template +overloaded(Ts...) 
-> overloaded; std::size_t LazyTensorImpl::get_hash() const { // Already calculated @@ -46,120 +171,23 @@ std::size_t LazyTensorImpl::get_hash() const { return m_hash; } - // Get parent's hash + // Get hash std::size_t seed = 0; - if (m_parent) { - seed = m_parent->get_hash(); - } else { - seed = std::hash()(m_orig_data) + 0x9e3779b9; - for (const auto& dim : m_orig_shape) { - seed ^= std::hash()(dim) + 0x9e3779b9; - } - seed ^= m_orig_type.hash() + 0x9e3779b9; - } - - // Combine with this hash - seed ^= std::hash()(static_cast(m_transform.first)) + 0x9e3779b9; - if (m_transform.first == TransformType::PERMUTE) { - const auto& axes = std::get>(m_transform.second); - for (const auto& axis : axes) { - seed ^= std::hash()(axis) + 0x9e3779b9; - } - } else if (m_transform.first == TransformType::CONCAT) { - const auto& axis = std::get(m_transform.second).second; - seed ^= std::hash()(axis) + 0x9e3779b9; - for (auto& lt : std::get(m_transform.second).first) { - seed ^= lt.get_hash() + 0x9e3779b9; - } - } else if (m_transform.first == TransformType::UNPACK) { - const auto& unpack_meta = std::get(m_transform.second); - seed ^= std::get<0>(unpack_meta).get_hash() + 0x9e3779b9; - seed ^= std::get<1>(unpack_meta).get_hash() + 0x9e3779b9; - seed ^= std::get<2>(unpack_meta).get_hash() + 0x9e3779b9; - for (const auto& dim : std::get<3>(unpack_meta)) { - seed ^= std::hash()(dim) + 0x9e3779b9; - } - seed ^= std::get<4>(unpack_meta).hash() + 0x9e3779b9; - } + std::visit(overloaded{[&seed](const auto& op) { + seed ^= op.hash(); + }}, + m_transform); return seed; } -} // namespace weights -} // namespace npuw -} // namespace ov - -using ov::npuw::weights::LazyTensorImpl; - -LazyTensorImpl::LazyTensorImpl(const TransformType& type, const Transform& transform) { - if (type == TransformType::THIS && std::holds_alternative(transform)) { - m_transform = std::make_pair(type, transform); - ov::Tensor tensor; - if (std::holds_alternative(std::get(transform))) { - tensor = ov::npuw::util::tensor_from_const(std::get(std::get(transform))); - } else { - tensor = std::get(std::get(transform)); - if (!tensor) { - // Don't set anything - return; - } - } - m_orig_data = tensor.data(); - m_orig_shape = tensor.get_shape(); - m_orig_type = tensor.get_element_type(); - } else if (type == TransformType::CONCAT && std::holds_alternative(transform)) { - m_transform = std::make_pair(type, transform); - } else if (type == TransformType::UNPACK && std::holds_alternative(transform)) { - m_transform = std::make_pair(type, transform); - } else { - NPUW_ASSERT(false); - } +LazyTensorImpl::LazyTensorImpl(Transform&& t) { + m_transform = std::move(t); m_hash = get_hash(); } bool LazyTensorImpl::operator==(const LazyTensorImpl& other) const { - if (m_hash != other.m_hash || m_orig_data != other.m_orig_data || m_orig_shape != other.m_orig_shape || - m_orig_type != other.m_orig_type || m_transform.first != other.m_transform.first) { - return false; - } - - switch (m_transform.first) { - case TransformType::THIS: - // everything is already compared above - skip - break; - case TransformType::CONVERT: - // everything is already compared above - skip - break; - case TransformType::PERMUTE: - if (std::get>(m_transform.second) != - std::get>(other.m_transform.second)) { - return false; - } - break; - case TransformType::CONCAT: - if (std::get(m_transform.second) != std::get(other.m_transform.second)) { - return false; - } - break; - case TransformType::UNPACK: - if (std::get(m_transform.second) != std::get(other.m_transform.second)) { - return 
false; - } - break; - default: - NPUW_ASSERT(false); - break; - } - - if ((m_parent && !other.m_parent) || (!m_parent && other.m_parent)) { - return false; - } - - if (m_parent && other.m_parent) { - return *m_parent.get() == *other.m_parent.get(); - } - - return true; + return m_hash == other.m_hash && m_transform == other.m_transform; } ov::Tensor LazyTensorImpl::eval() const { @@ -173,82 +201,37 @@ ov::Tensor LazyTensorImpl::eval() const { Perhaps it should be done after model compilation and not handled here. */ - // Process the initial tensor - either from Const or from Concat - if (!m_parent) { - if (m_transform.first == TransformType::THIS) { - return get_orig_tensor(); - } else if (m_transform.first == TransformType::CONCAT) { - std::vector to_concat; - for (const auto& lt : std::get(m_transform.second).first) { - // Sanity check - NPUW_ASSERT(!lt.has_transformations()); - to_concat.push_back(lt.get_orig_tensor()); - } - return ov::npuw::util::concat(to_concat, std::get(m_transform.second).second); - } else if (m_transform.first == TransformType::UNPACK) { - const auto& unpack_meta = std::get(m_transform.second); - const auto& cw = std::get<0>(unpack_meta); - const auto& cz = std::get<1>(unpack_meta); - const auto& cs = std::get<2>(unpack_meta); - const auto& shape = std::get<3>(unpack_meta); - const auto& type = std::get<4>(unpack_meta); - - // Note: unpacking done in-place since the original tensor is empty at this point - NPUW_ASSERT(!cw.has_transformations()); - NPUW_ASSERT(!cs.has_transformations()); - // FIXME: Ugly check concat case as well since cz might be not set - if (cz.has_transformations()) { - NPUW_ASSERT(false); - } - - const auto& gti = ov::get_tensor_impl; - const auto& tw = cw.get_orig_tensor(); - const auto& tz = cz.get_orig_tensor(); - const auto& ts = cs.get_orig_tensor(); - ov::Tensor dst(type, shape); - if (tw && tz && ts) { - ov::npuw::util::unpack(gti(tw), gti(tz), gti(ts), gti(dst)); - } else if (tw && ts) { - ov::npuw::util::unpack(gti(tw), gti(ts), gti(dst)); - } else { - NPUW_ASSERT(false && "Unsupported combination"); - } - return dst; - } else { - NPUW_ASSERT(false); - } - } - - // Process transformation - switch (m_transform.first) { - case TransformType::PERMUTE: - return ov::npuw::util::permute(m_parent->eval(), std::get>(m_transform.second)); - case TransformType::CONVERT: - return ov::npuw::util::to_f16(m_parent->eval()); - default: - NPUW_ASSERT(false); - } - - NPUW_ASSERT(false); - return ov::Tensor(); + ov::Tensor result = std::visit(overloaded{[](const auto& op) { + return op.eval(); + }}, + m_transform); + NPUW_ASSERT(result); + return result; } -ov::Tensor LazyTensorImpl::get_orig_tensor() const { - // Sanity check - NPUW_ASSERT(!has_transformations()); - if (std::holds_alternative(std::get(m_transform.second))) { - return ov::npuw::util::tensor_from_const(std::get(std::get(m_transform.second))); - } - return std::get(std::get(m_transform.second)); +LazyTensor::LazyTensor(const std::shared_ptr& const_ptr) + : m_impl(std::make_shared(op::Const{const_ptr})) {} +LazyTensor::LazyTensor(const std::vector& to_concat, const std::size_t axis) + : m_impl(std::make_shared(op::Concat{to_concat, axis})) {} +LazyTensor::LazyTensor(const LazyTensor& cw, + const LazyTensor& cz, + const LazyTensor& cs, + const ov::element::Type& type, + const ov::Shape& shape) + : m_impl(std::make_shared(op::Unpack{cw, cz, cs, type, shape})) {} + +LazyTensor LazyTensor::permute(const std::vector& axes) { + LazyTensor new_lt; + new_lt.m_impl = 
+    return new_lt;
 }
 
-bool LazyTensorImpl::has_transformations() const {
-    return m_transform.first != TransformType::THIS;
+LazyTensor LazyTensor::convert(const ov::element::Type& type) {
+    LazyTensor new_lt;
+    new_lt.m_impl = std::make_shared<LazyTensorImpl>(op::Convert{*this, type});
+    return new_lt;
 }
 
-LazyTensor::LazyTensor(const TransformType& type, const Transform& transform)
-    : m_impl(std::make_shared<LazyTensorImpl>(type, transform)) {}
-
 bool LazyTensor::operator==(const LazyTensor& other) const {
     return *m_impl.get() == *other.m_impl.get();
 }
@@ -257,37 +240,20 @@ bool LazyTensor::operator!=(const LazyTensor& other) const {
     return !(*m_impl.get() == *other.m_impl.get());
 }
 
-void LazyTensor::update(const TransformType& type, const Transform& transform) {
-    const auto& curr = m_impl;
-    auto new_lt = std::make_shared<LazyTensorImpl>();
-
-    new_lt->m_orig_data = curr->m_orig_data;
-    new_lt->m_orig_shape = curr->m_orig_shape;
-    new_lt->m_orig_type = curr->m_orig_type;
-
-    new_lt->m_transform = std::make_pair(type, transform);
-    new_lt->m_parent = curr;
-    new_lt->m_hash = new_lt->get_hash();
-
-    m_impl = new_lt;
-}
-
 ov::Tensor LazyTensor::eval() const {
+    if (!m_impl) {
+        return ov::Tensor();
+    }
     return m_impl->eval();
 }
 
-ov::Tensor LazyTensor::get_orig_tensor() const {
-    return m_impl->get_orig_tensor();
-}
-
 std::size_t LazyTensor::get_hash() const {
+    if (!m_impl) {
+        return 0;
+    }
     return m_impl->get_hash();
 }
 
 std::size_t LazyTensor::Hash::operator()(const LazyTensor& lt) const {
     return lt.get_hash();
 }
-
-bool LazyTensor::has_transformations() const {
-    return m_impl->has_transformations();
-}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
index 5cdeeba058e45f..365d9d636872b8 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
@@ -5,33 +5,17 @@
 #pragma once
 
 #include <memory>
-#include <tuple>
-#include <variant>
-#include <vector>
 
-#include "logging.hpp"
-#include "openvino/runtime/make_tensor.hpp"
+#include "openvino/op/constant.hpp"
 #include "openvino/runtime/tensor.hpp"
-#include "util.hpp"
 
 namespace ov {
 namespace npuw {
 namespace weights {
-
-enum class TransformType : int { THIS, PERMUTE, CONVERT, CONCAT, UNPACK };
-
 // Forward declaration
 class LazyTensor;
 struct LazyTensorImpl;
 
-using ConcatMeta = std::pair<std::vector<LazyTensor>, std::size_t>;
-using UnpackMeta = std::tuple<LazyTensor, LazyTensor, LazyTensor, ov::Shape, ov::element::Type>;
-using ConstPtr = std::shared_ptr<ov::op::v0::Constant>;
-using OrigData = std::variant<ConstPtr, ov::Tensor>;
-
-using Transform = std::variant<OrigData, std::vector<std::size_t>, std::monostate, ConcatMeta, UnpackMeta>;
-
 class LazyTensor {
 public:
     class Hash {
@@ -40,17 +24,23 @@ class LazyTensor {
     };
 
     LazyTensor() = default;
-    LazyTensor(const TransformType& type, const Transform& transform);
+    LazyTensor(const std::shared_ptr<ov::op::v0::Constant>& const_ptr);
+    LazyTensor(const std::vector<LazyTensor>& to_concat, const std::size_t axis);  // construct from concat
+    LazyTensor(const LazyTensor& cw,
+               const LazyTensor& cz,
+               const LazyTensor& cs,
+               const ov::element::Type& type,
+               const ov::Shape& shape);  // construct from unpack
+
+    LazyTensor permute(const std::vector<std::size_t>& axes);
+    LazyTensor convert(const ov::element::Type& type);
 
     bool operator==(const LazyTensor& other) const;
     bool operator!=(const LazyTensor& other) const;
 
-    void update(const TransformType& type, const Transform& transform);
     ov::Tensor eval() const;
-    ov::Tensor get_orig_tensor() const;
 
     std::size_t get_hash() const;
-    bool has_transformations() const;
 
 private:
     std::shared_ptr<LazyTensorImpl> m_impl = nullptr;
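The hunks above replace the old TransformType enum plus std::pair/std::tuple plumbing with one small struct per transformation, all stored in a std::variant, so hashing and evaluation become a single std::visit over whatever is stored. Below is a minimal standalone sketch of that idiom, not part of the patch: the Fill and Negate structs and the int payload are made-up stand-ins for the real op::Const, op::Permute and friends, and 0x9e3779b9 is the same seed-combining constant the ops above use.

// Standalone sketch (C++17), not part of the patch: a variant of op structs with a
// std::visit "overloaded" dispatcher, mirroring LazyTensorImpl::get_hash()/eval().
#include <cstddef>
#include <functional>
#include <iostream>
#include <variant>

template <class... Ts>
struct overloaded : Ts... {
    using Ts::operator()...;
};
template <class... Ts>
overloaded(Ts...) -> overloaded<Ts...>;

struct Fill {  // made-up stand-in for op::Const
    int value = 0;
    std::size_t hash() const {
        return std::hash<int>()(value) + 0x9e3779b9;
    }
    int eval() const {
        return value;
    }
};
struct Negate {  // made-up stand-in for a wrapping op like op::Convert
    int value = 0;
    std::size_t hash() const {
        std::size_t seed = std::hash<int>()(value) + 0x9e3779b9;
        seed ^= std::hash<int>()(1) + 0x9e3779b9;  // combine a second component, boost-style
        return seed;
    }
    int eval() const {
        return -value;
    }
};

using Transform = std::variant<Fill, Negate>;

int main() {
    Transform t = Negate{42};
    std::size_t seed = 0;
    // One generic visitor covers every alternative, as in the refactored code
    std::visit(overloaded{[&seed](const auto& op) {
                   seed ^= op.hash();
               }},
               t);
    std::cout << "hash=" << seed << " eval=" << std::visit([](const auto& op) { return op.eval(); }, t) << "\n";
    return 0;
}

A side effect of this layout, visible in the header diff above, is that LazyTensor's public surface shrinks to a few constructors plus permute()/convert(), which return new LazyTensor values instead of mutating one in place via update().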
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 99705fef30e8a8..2ff41be4c19f78 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -1525,8 +1525,7 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) {
 
             LOG_DEBUG("Register " << prod_output << " in the function closure");
             funcall._lazy_closure.push_back(
-                LazyTensor(TransformType::THIS,
-                           std::static_pointer_cast<ov::op::v0::Constant>(input_node)));  // (n)/1/i/c
+                LazyTensor(std::static_pointer_cast<ov::op::v0::Constant>(input_node)));  // (n)/1/i/c
         } else if (ov::op::util::is_parameter(input_node)) {
             LOG_DEBUG("Handling a Parameter input " << prod_output);
             LOG_BLOCK();
@@ -1695,8 +1694,7 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
                     LOG_DEBUG("Register " << prod_output << " in the function closure[" << param_idx
                                           << "] (via prototype " << proto_layer_name << ")");
                     funcall._lazy_closure[param_idx - function._param_offset] =
-                        LazyTensor(TransformType::THIS,
-                                   std::static_pointer_cast<ov::op::v0::Constant>(input_node));  // (t)/1/c
+                        LazyTensor(std::static_pointer_cast<ov::op::v0::Constant>(input_node));  // (t)/1/c
                 }
             }  // for (inputs)
         }      // for(nodes)
@@ -1765,7 +1763,7 @@ void Partitioner::optimize(const std::string& func_name) {
             auto closure_idx = param_idx - f._param_offset;
             ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
                 auto& funcall = func_group.refs[f_idx].get();
-                funcall._lazy_closure[closure_idx].update(TransformType::PERMUTE, p.second);
+                funcall._lazy_closure[closure_idx] = funcall._lazy_closure[closure_idx].permute(p.second);
             });
         }
     };
@@ -1775,7 +1773,7 @@ void Partitioner::optimize(const std::string& func_name) {
             auto closure_idx = param_idx - f._param_offset;
             ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
                 auto& funcall = func_group.refs[f_idx].get();
-                funcall._lazy_closure[closure_idx].update(TransformType::CONVERT, std::monostate{});
+                funcall._lazy_closure[closure_idx] = funcall._lazy_closure[closure_idx].convert(ov::element::f16);
             });
         }
     };
@@ -1830,15 +1828,12 @@ void Partitioner::optimize(const std::string& func_name) {
             std::vector<LazyTensor> to_concat;
             // Fill tensor vector
             for (auto&& cidx : to_concat_idx) {
-                // FIXME: Assuming here concat goes first and other transformations later.
-                // This allows to store ov::Tensor and ignore their potential history of transformations
-                NPUW_ASSERT(!funcall._lazy_closure[cidx].has_transformations());
                 to_concat.push_back(funcall._lazy_closure[cidx]);
             }
             // Note: we can ignore updating funcall._lazy_closure[cidx] here since those LazyTensors will be gone
             // and the new one added into the vector
             if (!to_concat.empty()) {
-                funcall._lazy_closure.push_back(LazyTensor(TransformType::CONCAT, std::make_pair(to_concat, axis)));
+                funcall._lazy_closure.push_back(LazyTensor(to_concat, axis));
                 // Some of the tensors might be in closure - preserve it's 1:1 idx mapping with _lazy_closure
                 funcall._closure.push_back(ov::Tensor());
             }
@@ -1865,17 +1860,11 @@ void Partitioner::optimize(const std::string& func_name) {
 
         ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
             auto& funcall = func_group.refs[f_idx].get();
-            // FIXME: assuming no transformations were applied to the tensor - since we are utilizing the original
-            // ov::Tensor below
             LazyTensor cw = funcall._lazy_closure[w_idx - f._param_offset];
-            LazyTensor cz = z_idx != -1 ? funcall._lazy_closure[z_idx - f._param_offset]
-                                        : LazyTensor(TransformType::THIS, ov::Tensor());
+            LazyTensor cz = z_idx != -1 ? funcall._lazy_closure[z_idx - f._param_offset] : LazyTensor();
             LazyTensor cs = funcall._lazy_closure[s_idx - f._param_offset];
-
-            // FIXME: currently there is an issue that we don't share such tensor between head and tail
             funcall._lazy_closure.push_back(
-                LazyTensor(TransformType::UNPACK,
-                           std::make_tuple(cw, cz, cs, p.first->get_shape(), p.first->get_element_type())));
+                LazyTensor(cw, cz, cs, p.first->get_element_type(), p.first->get_shape()));
             // Some of the tensors might be in closure - preserve it's 1:1 idx mapping with _lazy_closure
             funcall._closure.push_back(ov::Tensor());
         });
@@ -1899,7 +1888,7 @@ void Partitioner::optimize(const std::string& func_name) {
             // Based on our logic (when tensors get transferred from lazy tensors via bank
             // to the closure), this tensor should be non-empty to avoid this process.
             funcall.get()._closure.push_back(ov::Tensor(new_elem_type, new_shape));
-            funcall.get()._lazy_closure.push_back(LazyTensor(TransformType::THIS, ov::Tensor()));
+            funcall.get()._lazy_closure.push_back(LazyTensor());
         }
     }
 
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index 60f705a0c8f26c..641ee7690f4d34 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -94,8 +94,14 @@ ClosureRemap build_remap(const Function& fbody, const DCOFFParams& params_to) {
         } else if (ban_list.find(param) == ban_list.end()) {
             // If it's not in the ban list, it's an OK parameter and should be kept
             LOG_DEBUG("This is an OK parameter, will be kept");
-            m.weights_to_unpack.insert(i - fbody._param_offset);
             m.closure_remap.push_back(i - fbody._param_offset);
+
+            // Check if unpack is indeed required
+            const auto& type = param->get_element_type();
+            if (type == ov::element::i4 || type == ov::element::u4 || type == ov::element::i8 ||
+                type == ov::element::u8) {
+                m.weights_to_unpack.insert(i - fbody._param_offset);
+            }
         }
 
         // Process zero points for parameters

From cbb3760568891af2cb8c8938793cb1d96e51e11f Mon Sep 17 00:00:00 2001
From: Alexey Smirnov
Date: Fri, 29 Nov 2024 12:50:14 +0000
Subject: [PATCH 3/3] NPUW memory and L0 pipeline hotfixes

---
 .../intel_npu/src/plugin/npuw/compiled_model.cpp     |  5 +++--
 .../src/plugin/npuw/partitioning/patterns/dcoff.cpp  | 10 ++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 8770dee0d68eea..e3cffa91bc681a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -705,8 +705,9 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infe
     const auto num_submodels = m_compiled_submodels.size();
     for (std::size_t idx = 0u; idx < num_submodels; idx++) {
         const auto& comp_model_desc = m_compiled_submodels[idx];
-        if (!comp_model_desc.replaced_by.has_value()) {
-            // not a funcall, do nothing
+        if (!comp_model_desc.replaced_by.has_value() || comp_model_desc.forced_to_fcall) {
+            // not a funcall, do nothing, or a subgraph that was forced to funcall
+            // (a 1-call function) - skip
             continue;
         }
         const auto real_idx = comp_model_desc.replaced_by.value();
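The compiled_model.cpp hunk above tightens the pipeline-setup loop: a submodel gets a function pipeline slot only if it was replaced by a shared function body and was not forced into a single-call funcall. A hedged, standalone sketch of that filter follows; Desc is a made-up stand-in for the real submodel descriptor and models only the two fields the condition reads.

// Standalone sketch, not part of the patch: the skip rule from
// create_sync_infer_request(), applied to a toy descriptor list.
#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

struct Desc {
    std::optional<std::size_t> replaced_by;  // set when the subgraph calls a shared function body
    bool forced_to_fcall = false;            // a 1-call function: compiled as a funcall, but not pipelined
};

int main() {
    std::vector<Desc> submodels = {
        {std::nullopt, false},    // plain subgraph - skipped
        {std::size_t{1}, false},  // real funcall - gets set up
        {std::size_t{2}, true},   // forced 1-call funcall - also skipped after the hotfix
    };
    for (std::size_t idx = 0u; idx < submodels.size(); idx++) {
        const auto& desc = submodels[idx];
        if (!desc.replaced_by.has_value() || desc.forced_to_fcall) {
            continue;  // same early-out as in the hunk above
        }
        std::cout << "submodel " << idx << " uses function body " << desc.replaced_by.value() << "\n";
    }
    return 0;
}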
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index 641ee7690f4d34..61963e2aea5ca6 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -96,12 +96,10 @@ ClosureRemap build_remap(const Function& fbody, const DCOFFParams& params_to) {
             LOG_DEBUG("This is an OK parameter, will be kept");
             m.closure_remap.push_back(i - fbody._param_offset);
 
-            // Check if unpack is indeed required
-            const auto& type = param->get_element_type();
-            if (type == ov::element::i4 || type == ov::element::u4 || type == ov::element::i8 ||
-                type == ov::element::u8) {
-                m.weights_to_unpack.insert(i - fbody._param_offset);
-            }
+            // FIXME: type should be queried from a lazy tensor
+            // and compared against param->get_element_type()
+            // to decide 100%
+            m.weights_to_unpack.insert(i - fbody._param_offset);
         }
 
         // Process zero points for parameters
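For context on the dcoff.cpp change above: the gate that the first commit added (and this hotfix drops again, pending the FIXME about querying the type from a lazy tensor) reduces to a single predicate on the parameter's element type. The sketch below restates it standalone; needs_unpack is a hypothetical helper name, and the snippet assumes an OpenVINO development install for the ov::element types.

// Standalone sketch, not part of the patch: only sub-byte and 8-bit integer
// weights were routed through the unpack path by the removed check.
#include <openvino/core/type/element_type.hpp>

static bool needs_unpack(const ov::element::Type& type) {
    return type == ov::element::i4 || type == ov::element::u4 || type == ov::element::i8 ||
           type == ov::element::u8;
}

int main() {
    // i4 weights would have been registered in weights_to_unpack, f16 would not.
    return (needs_unpack(ov::element::i4) && !needs_unpack(ov::element::f16)) ? 0 : 1;
}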