From 870bb7979169fc5d476b7397429a716ccefcada7 Mon Sep 17 00:00:00 2001
From: Eugene Smirnov
Date: Fri, 29 Nov 2024 16:53:32 +0000
Subject: [PATCH] NPUW: Unfold infer requests (#27319) (#27825)

Cherry-picked PR: https://github.com/openvinotoolkit/openvino/pull/27319

Co-authored-by: Dmitry Matveev
---
 .../src/al/include/intel_npu/config/npuw.hpp  |   1 +
 .../intel_npu/npuw_private_properties.hpp     |   8 +
 .../intel_npu/src/al/src/config/npuw.cpp      |   1 +
 .../plugin/npuw/base_sync_infer_request.cpp   | 291 ++++++++++++++++-
 .../plugin/npuw/base_sync_infer_request.hpp   |  37 ++-
 .../src/plugin/npuw/compiled_model.cpp        |  97 ++++--
 .../src/plugin/npuw/compiled_model.hpp        |  14 +-
 .../plugin/npuw/just_sync_infer_request.cpp   | 308 ++----------------
 .../plugin/npuw/just_sync_infer_request.hpp   |  34 +-
 .../plugin/npuw/unfold_sync_infer_request.cpp | 140 ++++++++
 .../plugin/npuw/unfold_sync_infer_request.hpp |  42 +++
 11 files changed, 621 insertions(+), 352 deletions(-)
 create mode 100644 src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
 create mode 100644 src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp

diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
index bdda589a7bcb7b..6d865ad5e4edf3 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -57,6 +57,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
 DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
 DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime);
 DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
+DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime);
 DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
 DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
 DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime);
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
index 648bcde0cdc913..af4a17988f451e 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -287,6 +287,14 @@ static constexpr ov::Property<bool> parallel_compilation{"NPUW_PARALLEL_COMPILE"
  */
 static constexpr ov::Property<bool> funcall_async{"NPUW_FUNCALL_ASYNC"};
+/**
+ * @brief
+ * Type: boolean
+ * Create individual infer requests for partitions, even repeating ones.
+ * Default value: false.
+ */ +static constexpr ov::Property unfold_ireqs{"NPUW_UNFOLD_IREQS"}; + namespace accuracy { /** * @brief diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 84ac94d1f7c67b..0c7978845c690c 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -39,6 +39,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp index 216b1a35b4315c..77d000cb415de7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp @@ -7,6 +7,7 @@ #include "compiled_model.hpp" #include "intel_npu/config/npuw.hpp" #include "logging.hpp" +#include "openvino/core/parallel.hpp" #include "util.hpp" ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr& compiled_model) @@ -58,12 +59,8 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re LOG_INFO("- Trying next device..."); comp_model_desc.device_it++; can_try_again = m_npuw_model->compile_for_success(id); - if (can_try_again) { - if (recompiled) - *recompiled = true; - // Probably shouldn't be called all the time, but only if - // I/O submodel is affected - m_npuw_model->reset_io(); + if (can_try_again && recompiled) { + *recompiled = true; } } } // while(!new_ireq && can_try_again) @@ -178,6 +175,33 @@ void ov::npuw::IBaseInferRequest::check_tensors() const { return; } +std::vector> ov::npuw::IBaseInferRequest::query_state() const { + std::vector> variable_states = {}; + for (const auto& request : m_subrequests) { + if (!request) // optimized out + continue; + for (auto&& state : request->query_state()) { + if (!state._so) + state._so = request._so; + variable_states.emplace_back(state); + } + } + return variable_states; +} + +std::vector ov::npuw::IBaseInferRequest::get_profiling_info() const { + std::vector info; + for (size_t i = 0; i < m_subrequests.size(); ++i) { + if (!m_subrequests[i]) // optimized out + continue; + auto&& subreq_info = m_subrequests[i]->get_profiling_info(); + for (auto&& rec : subreq_info) + rec.node_name = std::string("subgraph") + std::to_string(i) + ": " + rec.node_name; + info.insert(info.end(), subreq_info.begin(), subreq_info.end()); + } + return info; +} + void ov::npuw::IBaseInferRequest::infer() { m_now_idx.reset(); prepare_for_infer(); @@ -209,6 +233,261 @@ void ov::npuw::IBaseInferRequest::infer() { m_now_idx.reset(); } +std::size_t ov::npuw::IBaseInferRequest::total_subrequests() const { + return m_subrequests.size(); +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocMem(const ov::element::Type type, + const ov::Shape& shape, + const std::string& device) { + if (device == "CPU" || ov::shape_size(shape) == 0) { + return ov::get_tensor_impl(ov::Tensor(type, shape)); + } + + auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; + auto remote_tensor = remote_ctx->create_host_tensor(type, shape); + return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocOut(const ov::Output& node, + const std::string& device) { + return allocMem(node.get_element_type(), node.get_shape(), device); +} + +void ov::npuw::IBaseInferRequest::alloc_io() { 
+ // Preallocate input tensors + LOG_INFO("Preallocating input tensors..."); + for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { + const auto& port = m_npuw_model->inputs()[i]; + ov::SoPtr allocated = allocOut(port, m_npuw_model->global_mem_device()); + m_input_tensors.push_back(allocated); + m_input_allocated.insert(allocated->data()); + m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true}; + } // for(inputs) + + // Preallocate output tensors + LOG_INFO("Preallocating output tensors..."); + for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { + LOG_BLOCK(); + const auto& port = m_npuw_model->outputs()[i]; + LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port); + + // FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom + const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i); + LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second); + + auto tensor = alloc_global_out(i); + m_output_tensors.push_back(tensor); + m_port_to_tensor[port] = TensorStorage{tensor, true}; + } +} + +ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) { + const auto& port = m_npuw_model->outputs().at(out_idx); + return allocOut(port, m_npuw_model->global_mem_device()); +} + +void ov::npuw::IBaseInferRequest::init_gio() { + // Build the parameter/result mapping + m_subrequests_gio.resize(m_subrequests.size()); + + // Parameters: stage 1... + for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { + const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i); + if (to_submodel != CompiledModel::NO_LINK) { + std::size_t sub_idx{}, in_idx{}; + std::tie(sub_idx, in_idx) = to_submodel; + m_subrequests_gio.at(sub_idx).global_params[i] = in_idx; + } + } // for(inputs) + + // Parameters: stage 2... + for (auto&& it : m_npuw_model->m_param_subscribers) { + const auto param_idx = it.first; + for (auto&& to_submodel : it.second) { + std::size_t sub_idx{}, in_idx{}; + std::tie(sub_idx, in_idx) = to_submodel; + m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx; + } + } + + // Results + for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { + std::size_t sub_idx{}, out_idx{}; + std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i); + m_subrequests_gio.at(sub_idx).global_results[i] = out_idx; + } +} + +void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request) { + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + + NPUW_ASSERT(comp_model_desc.replaced_by); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx]; + + // Bind extra parameters from the function's closure + // First, do easy things & delay heavy stuff + std::vector closure_unpack_required; + std::vector closure_copy_required; + + for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { + auto& closure = comp_model_desc.closure[cidx]; + const auto closure_param_id = comp_model_desc.param_base + cidx; + + if (m_npuw_model->is_gather_closure(idx, cidx)) { + // No need to set/copy the host_gather's closure tensor int + // the subrequest - it is just a dummy. host_gather writes + // to the right buffer directly. 
+ continue; + } + + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + if (m_npuw_model->unpack_required(idx, cidx)) { + // Remember where the unpack is required + closure_unpack_required.push_back(cidx); + } else { + if (needs_copy(idx, cidx)) { + // Remember where copy is requried + closure_copy_required.push_back(cidx); + } else { + // Easy case, just set one to another + request->set_tensor(iport, ov::get_tensor_impl(closure)); + } + } + } // for(closure) + + // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){ + ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) { + auto cidx = closure_copy_required[j]; + auto& closure = comp_model_desc.closure[cidx]; + const auto closure_param_id = comp_model_desc.param_base + cidx; + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + auto clparam = request->get_tensor(iport); + ov::get_tensor_impl(closure)->copy_to(clparam._ptr); + }); + // }); // ms_to_run + + for (std::size_t j = 0; j != closure_unpack_required.size(); j++) { + // NB: No need to protect anything here as containers are all + // preallocated and we only access elements under particular (thread + // -local) indices. + auto cidx = closure_unpack_required[j]; + + // FIXME: zerops are stored with absolute indexing, this needs to be aligned + auto& closure = comp_model_desc.closure[cidx]; + + const auto closure_param_id = comp_model_desc.param_base + cidx; + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + auto clparam = request->get_tensor(iport); + + if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) { + // Unpacking this weight requires scaling with zero points... + ov::npuw::util::unpack(ov::get_tensor_impl(closure), + ov::get_tensor_impl(comp_model_desc.zerops[cidx]), + ov::get_tensor_impl(comp_model_desc.scales[cidx]), + clparam); + } else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) { + // Unpacking this weight requires scaling + ov::npuw::util::unpack(ov::get_tensor_impl(closure), + ov::get_tensor_impl(comp_model_desc.scales[cidx]), + clparam); + } else { + // Unpacking this weight doesn't require scaling + ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam); + } + } +} + +void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr request) { + LOG_DEBUG("Binding parameters for Subgraph[" << idx << "]"); + LOG_BLOCK(); + + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + const auto real_idx = comp_model_desc.replaced_by.value_or(idx); + + const bool do_copy = needs_copy(idx); + const auto& iodesc = m_subrequests_gio.at(idx); + + const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const bool is_spatial = proto_comp_model_desc.spatial.has_value(); + + // a list of ports to copy tensors, if needed: FROM -> TO + std::vector, ov::Output>> copy_list; + + // Check if the given subgraph's input is spatial + auto is_spatial_param = [&](std::size_t sub_in_idx) -> bool { + if (!is_spatial) { + return false; // Early return + } + auto& spatial = proto_comp_model_desc.spatial.value(); + return std::any_of(spatial.params.begin(), spatial.params.end(), [&](const auto& p) -> bool { + return p.idx == sub_in_idx; + }); + }; + + for (auto&& it : iodesc.global_params) { + std::size_t param_idx{}, sub_in_idx{}; + std::tie(param_idx, sub_in_idx) = it; + LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl); + + const auto& g_port = 
m_npuw_model->inputs()[param_idx]; + const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor; + const auto& s_port = request->get_inputs()[sub_in_idx]; + LOG_DEBUG("Processing " << g_port << " -> " << s_port << "..."); + LOG_BLOCK(); + if (!is_spatial_param(sub_in_idx)) { + // Input parameter is non-spatial, do normal handling + if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) { + LOG_DEBUG("Will be copied"); + copy_list.emplace_back(g_tnsr, s_port); + } else { + LOG_DEBUG("Will be set"); + request->set_tensor(s_port, g_tnsr); + } + } else { + // Register for future use + m_spatial_io[real_idx].inputs.at(sub_in_idx) = g_tnsr; + } + } + + LOG_DEBUG("Running copy..."); + ov::parallel_for(copy_list.size(), [&](std::size_t idx) { + auto& it = copy_list[idx]; + ov::SoPtr dst = request->get_tensor(it.second); + it.first->copy_to(dst._ptr); + }); + + // Run host-side gather, if required + if (comp_model_desc.host_gather.dst_idx != -1) { + const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx]; + const auto gather = request->get_tensor(gport); + + const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base]; + const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx]; + const auto lookup = request->get_tensor(lport); + ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather); + } + + LOG_DEBUG("Done"); +} + +void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr request) { + LOG_DEBUG("Binding results for Subgraph[" << idx << "]"); + LOG_BLOCK(); + + const auto& iodesc = m_subrequests_gio.at(idx); + for (auto&& it : iodesc.global_results) { + std::size_t result_idx{}, sub_out_idx{}; + std::tie(result_idx, sub_out_idx) = it; + const auto& g_port = m_npuw_model->outputs()[result_idx]; + const auto& s_port = request->get_outputs()[sub_out_idx]; + request->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor); + } + + LOG_DEBUG("Done"); +} + void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp index 6be64d676d6149..ae24dcfee11f9d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp @@ -19,8 +19,15 @@ namespace ov { namespace npuw { +using TensorPtr = ov::SoPtr; + class CompiledModel; +using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure + // This interface is provided to npuw::AsyncInferRequest to manage the // individual subrequests' execution class IBaseInferRequest : public ov::ISyncInferRequest { @@ -40,6 +47,10 @@ class IBaseInferRequest : public ov::ISyncInferRequest { void check_tensors() const override; + // Query APIs - some default implementations here + std::vector> query_state() const override; + std::vector get_profiling_info() const override; + using sptr = std::shared_ptr; using Completed = std::function; @@ -50,7 +61,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest { virtual void run_subrequest_for_success(std::size_t idx, bool& failover) = 0; virtual void complete_subrequest(std::size_t idx) = 0; virtual void cancel_subrequest(std::size_t 
idx) = 0; - virtual std::size_t total_subrequests() const = 0; + virtual std::size_t total_subrequests() const; virtual bool supports_async_pipeline() const = 0; protected: @@ -107,8 +118,32 @@ class IBaseInferRequest : public ov::ISyncInferRequest { }; std::vector m_spatial_io; + // This structure tracks how every individual subrequest + // access the model's top-level (global, public, etc) parameters + // and results. Again, is managed by subclasses + struct GlobalIO { + using map_t = std::map; + map_t global_params; // param idx -> input idx + map_t global_results; // result idx -> output idx + }; + std::vector m_subrequests_gio; + + // Tracks tensors we allocated on our own - to recognize and avoid copies + std::unordered_set m_input_allocated; + + // Common functionality - shared for subclasses const std::size_t m_num_submodels; + TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device); + TensorPtr allocOut(const ov::Output& node, const std::string& device); + virtual void alloc_io(); + virtual TensorPtr alloc_global_out(std::size_t out_idx); + + virtual void init_gio(); + void unpack_closure(std::size_t idx, RqPtr request); + virtual void bind_global_params(std::size_t idx, RqPtr request); + virtual void bind_global_results(std::size_t idx, RqPtr request); + void dump_input_tensors(std::size_t idx); void dump_output_tensors(std::size_t idx); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 4110307ec1623e..c6be2793fe6f70 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -21,6 +21,7 @@ #include "openvino/util/common_util.hpp" #include "partitioning/patterns/opt.hpp" #include "plugin.hpp" +#include "unfold_sync_infer_request.hpp" #include "util.hpp" // required for get_properties_per_device() @@ -442,9 +443,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } implement_properties(); - - m_finalized = true; - reset_io(); + report_io(); } void ov::npuw::CompiledModel::finalize_weights_bank() { @@ -570,19 +569,7 @@ void ov::npuw::CompiledModel::fill_empty_tensor_names(const std::shared_ptr ov::npuw::CompiledModel::create_just_sync_infer_request() { - auto this_sptr = std::static_pointer_cast(shared_from_this()); - return std::make_shared(this_sptr); -} - std::shared_ptr ov::npuw::CompiledModel::create_sync_infer_request() const { // Synchronous infer request implementation may vary based on the // selected strategy auto* non_const_this = const_cast(this); // because of const in API - return non_const_this->create_just_sync_infer_request(); + auto non_const_this_sptr = std::static_pointer_cast(non_const_this->shared_from_this()); + + auto no_spatial_unpack = [&]() { + const auto num_submodels = m_compiled_submodels.size(); + for (std::size_t idx = 0u; idx < num_submodels; idx++) { + const auto& comp_model_desc = m_compiled_submodels[idx]; + if (!comp_model_desc.replaced_by.has_value()) { + // not a funcall, do nothing + continue; + } + const auto real_idx = comp_model_desc.replaced_by.value(); + if (m_compiled_submodels[real_idx].spatial) { + LOG_WARN("Subgraph[" << idx << "] is a call to spatial function, unfold can't be done"); + return false; // Spatial graph + } + if (unpack_required(idx)) { + LOG_WARN("Subgraph[" << idx << "] requires unpack, unfold can't be done"); + return false; // Unpack required + } + } + return true; // no spatial & subgraphs 
requiring unpack found + }; + + std::shared_ptr result; + if (m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>() && no_spatial_unpack()) { + result.reset(new ov::npuw::UnfoldInferRequest(non_const_this_sptr)); + } else { + result.reset(new ov::npuw::JustInferRequest(non_const_this_sptr)); + } + NPUW_ASSERT(result); + return result; } std::shared_ptr ov::npuw::CompiledModel::create_infer_request() const { @@ -776,6 +788,46 @@ std::string ov::npuw::CompiledModel::submodel_device(const std::size_t idx) cons return *comp_subm_desc.device_it; } +bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx) const { + auto& comp_model_desc = m_compiled_submodels.at(idx); + for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { + if (unpack_required(idx, cidx)) { + return true; + } + } + return false; +} + +bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx, const std::size_t cidx) const { + if (is_gather_closure(idx, cidx)) { + return false; + } + + auto& comp_model_desc = m_compiled_submodels.at(idx); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_compiled_submodels.at(real_idx); + + auto& closure = comp_model_desc.closure.at(cidx); + const auto closure_param_id = comp_model_desc.param_base + cidx; + + auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; + return (closure.get_element_type() != iport.get_element_type()); +} + +bool ov::npuw::CompiledModel::is_gather_closure(const std::size_t idx, const std::size_t cidx) const { + auto& comp_model_desc = m_compiled_submodels.at(idx); + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& func_desc = m_compiled_submodels.at(real_idx); + + const auto closure_param_id = comp_model_desc.param_base + cidx; + + if (func_desc.host_gather.dst_idx != -1 && + static_cast(func_desc.host_gather.dst_idx) == closure_param_id) { + return true; + } + return false; +} + void ov::npuw::CompiledModel::log_device_dist() const { std::unordered_map stats_for_devices; execution_stats stats_for_optimized_out{0.f, 0ul}; @@ -935,6 +987,7 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE), BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE), BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC), + BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS), BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK), BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC), BIND(npuw::cache_dir, NPUW_CACHE_DIR), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 6199ac66c0c64e..ece1bc78fb5bf5 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -47,6 +47,7 @@ class CompiledModel : public ov::ICompiledModel { // FIXME: This class has many friends.. 
friend class IBaseInferRequest; friend class JustInferRequest; + friend class UnfoldInferRequest; friend class MemAccessSim; friend class FuncMemMgr; @@ -57,28 +58,27 @@ class CompiledModel : public ov::ICompiledModel { void dump_on_fail(std::size_t id, const std::string& device_to_stry, const char* extra); - bool m_finalized = false; - void reset_io(); + void report_io() const; // This is used for removing too long output tensor names to fix some compilation issues + // NB: These two methods has nothing to do with this particular class and should be + // moved elsewhere void remove_long_output_names(const std::shared_ptr& model); void fill_empty_tensor_names(const std::shared_ptr& model); std::shared_ptr get_npuw_plugin() const; - - std::shared_ptr create_just_sync_infer_request(); std::shared_ptr create_sync_infer_request() const override; std::string submodel_device(const std::size_t idx) const; + bool is_gather_closure(const std::size_t idx, const std::size_t cidx) const; + bool unpack_required(const std::size_t idx) const; + bool unpack_required(const std::size_t idx, const std::size_t cidx) const; void log_device_dist() const; - void implement_properties(); void finalize_weights_bank(); - std::string global_mem_device() const; - std::string funcall_mem_device(const std::size_t idx) const; std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 0e0b96582a663c..8d1c7c4a30acde 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -310,69 +310,9 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrinputs().size(); i++) { - const auto& port = m_npuw_model->inputs()[i]; - ov::SoPtr allocated = allocOut(port, m_npuw_model->global_mem_device()); - m_input_tensors.push_back(allocated); - m_input_allocated.insert(allocated->data()); - m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true}; - } // for(inputs) - - // Preallocate output tensors - LOG_INFO("Preallocating output tensors..."); - for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { - LOG_BLOCK(); - const auto& port = m_npuw_model->outputs()[i]; - LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port); - - // FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom - const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i); - - LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second); - auto funcall_result_iter = m_funcall_result.find(from_submodel); - - const auto& tensor = - funcall_result_iter != m_funcall_result.end() - ? funcall_result_iter->second // Function calls have their tensors allocated, so just use one - : allocOut(port, m_npuw_model->global_mem_device()); - - m_output_tensors.push_back(tensor); - m_port_to_tensor[port] = TensorStorage{tensor, true}; - } + alloc_io(); connect_subrequests(); - - // Build the parameter/result mapping {{{ - m_subrequests_gio.resize(m_subrequests.size()); - - // Parameters: stage 1... 
- for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) { - const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i); - if (to_submodel != CompiledModel::NO_LINK) { - std::size_t sub_idx{}, in_idx{}; - std::tie(sub_idx, in_idx) = to_submodel; - m_subrequests_gio.at(sub_idx).global_params[i] = in_idx; - } - } // for(inputs) - - // Parameters: stage 2... - for (auto&& it : m_npuw_model->m_param_subscribers) { - const auto param_idx = it.first; - for (auto&& to_submodel : it.second) { - std::size_t sub_idx{}, in_idx{}; - std::tie(sub_idx, in_idx) = to_submodel; - m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx; - } - } - - // Results - for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) { - std::size_t sub_idx{}, out_idx{}; - std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i); - m_subrequests_gio.at(sub_idx).global_results[i] = out_idx; - } - // }}} + init_gio(); for (size_t i = 0; i < m_num_submodels; i++) { LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]..."); @@ -413,6 +353,15 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_outputs_to_submodels_outputs.at(out_idx); + auto funcall_result_iter = m_funcall_result.find(from_submodel); + if (funcall_result_iter != m_funcall_result.end()) { + return funcall_result_iter->second; + } + return IBaseInferRequest::alloc_global_out(out_idx); +} + void ov::npuw::JustInferRequest::connect_subrequests() { LOG_INFO("Connecting subrequests..."); LOG_BLOCK(); @@ -478,33 +427,6 @@ void ov::npuw::JustInferRequest::connect_subrequests() { LOG_INFO("Done"); } -std::vector> ov::npuw::JustInferRequest::query_state() const { - std::vector> variable_states = {}; - for (const auto& request : m_subrequests) { - if (!request) // optimized out - continue; - for (auto&& state : request->query_state()) { - if (!state._so) - state._so = request._so; - variable_states.emplace_back(state); - } - } - return variable_states; -} - -std::vector ov::npuw::JustInferRequest::get_profiling_info() const { - std::vector info; - for (size_t i = 0; i < m_subrequests.size(); ++i) { - if (!m_subrequests[i]) // optimized out - continue; - auto&& subreq_info = m_subrequests[i]->get_profiling_info(); - for (auto&& rec : subreq_info) - rec.node_name = std::string("subgraph") + std::to_string(i) + ": " + rec.node_name; - info.insert(info.end(), subreq_info.begin(), subreq_info.end()); - } - return info; -} - void ov::npuw::JustInferRequest::prepare_for_infer() { LOG_DEBUG("Preparing to infer..."); LOG_BLOCK(); @@ -542,118 +464,36 @@ void ov::npuw::JustInferRequest::start_subrequest(std::size_t idx) { } void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) { - LOG_DEBUG("Binding parameters for Subgraph[" << idx << "]"); - LOG_BLOCK(); - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; const auto real_idx = comp_model_desc.replaced_by.value_or(idx); - const bool do_copy = needs_copy(idx); - const auto& iodesc = m_subrequests_gio.at(idx); - - const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - const bool is_spatial = proto_comp_model_desc.spatial.has_value(); - - // a list of ports to copy tensors, if needed: FROM -> TO - std::vector, ov::Output>> copy_list; - // pick which subrequest we actually work on here - auto subr = [&]() { - if (now_idx() && real_idx == real(now_idx().value()) && is_pipelined(now_idx().value())) { - LOG_DEBUG("Accessing the pipeline subrequest"); - // The real index of request we need 
to prepare IS - // the same request which executes now AND - // function_pipelining enabled - select the reserve request. - NPUW_ASSERT(m_funcall_pipeline[real_idx].subrequest); - return m_funcall_pipeline[real_idx].subrequest; - } + if (now_idx() && real_idx == real(now_idx().value()) && is_pipelined(now_idx().value())) { + LOG_DEBUG("Accessing the pipeline subrequest"); + // The real index of request we need to prepare IS + // the same request which executes now AND + // function_pipelining enabled - select the reserve request. + NPUW_ASSERT(m_funcall_pipeline[real_idx].subrequest); + bind_global_params(idx, m_funcall_pipeline[real_idx].subrequest); + } else { // Otherwise: Just a return a subrequest which is in place. // If it is a function call and we have function pipelining ON, // it is still the right subrequest we can use. LOG_DEBUG("Accessing the primary subrequest"); - return m_subrequests[real_idx]; - }(); - - // Check if the given subgraph's input is spatial - auto is_spatial_param = [&](std::size_t sub_in_idx) -> bool { - if (!is_spatial) { - return false; // Early return - } - auto& spatial = proto_comp_model_desc.spatial.value(); - return std::any_of(spatial.params.begin(), spatial.params.end(), [&](const auto& p) -> bool { - return p.idx == sub_in_idx; - }); - }; - - for (auto&& it : iodesc.global_params) { - std::size_t param_idx{}, sub_in_idx{}; - std::tie(param_idx, sub_in_idx) = it; - LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl); - - const auto& g_port = m_npuw_model->inputs()[param_idx]; - const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor; - const auto& s_port = subr->get_inputs()[sub_in_idx]; - LOG_DEBUG("Processing " << g_port << " -> " << s_port << "..."); - LOG_BLOCK(); - if (!is_spatial_param(sub_in_idx)) { - // Input parameter is non-spatial, do normal handling - if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) { - LOG_DEBUG("Will be copied"); - copy_list.emplace_back(g_tnsr, s_port); - } else { - LOG_DEBUG("Will be set"); - subr->set_tensor(s_port, g_tnsr); - } - } else { - // Register for future use - m_spatial_io[real_idx].inputs.at(sub_in_idx) = g_tnsr; - } + bind_global_params(idx, m_subrequests[real_idx]); } - - LOG_DEBUG("Running copy..."); - ov::parallel_for(copy_list.size(), [&](std::size_t idx) { - auto& it = copy_list[idx]; - ov::SoPtr dst = subr->get_tensor(it.second); - it.first->copy_to(dst._ptr); - }); - - // Run host-side gather, if required - if (comp_model_desc.host_gather.dst_idx != -1) { - const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx]; - const auto gather = subr->get_tensor(gport); - - const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base]; - const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx]; - const auto lookup = subr->get_tensor(lport); - ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather); - } - - LOG_DEBUG("Done"); } void ov::npuw::JustInferRequest::bind_global_results(std::size_t idx) { - LOG_DEBUG("Binding results for Subgraph[" << idx << "]"); - LOG_BLOCK(); - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; if (comp_model_desc.replaced_by) { // Don't do here - function call will take the right tensor // itself. 
Note it may be implemented more efficently than now // (and in some cases, the tensor can be pre-set) - LOG_DEBUG("Skipping this too now - function will do it for itself"); + LOG_DEBUG("Skipping bind_glo - function will do it for itself"); return; } - - const auto& iodesc = m_subrequests_gio.at(idx); - for (auto&& it : iodesc.global_results) { - std::size_t result_idx{}, sub_out_idx{}; - std::tie(result_idx, sub_out_idx) = it; - const auto& g_port = m_npuw_model->outputs()[result_idx]; - const auto& s_port = m_subrequests[idx]->get_outputs()[sub_out_idx]; - m_subrequests[idx]->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor); - } - - LOG_DEBUG("Done"); + IBaseInferRequest::bind_global_results(idx, m_subrequests[idx]); } void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { @@ -737,88 +577,6 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { LOG_DEBUG("Done"); } -void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - - NPUW_ASSERT(comp_model_desc.replaced_by); - const auto real_idx = comp_model_desc.replaced_by.value(); - auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx]; - - // Bind extra parameters from the function's closure - // First, do easy things & delay heavy stuff - std::vector closure_unpack_required; - std::vector closure_copy_required; - - for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) { - auto& closure = comp_model_desc.closure[cidx]; - - const auto closure_param_id = comp_model_desc.param_base + cidx; - - if (func_desc.host_gather.dst_idx != -1 && - static_cast(func_desc.host_gather.dst_idx) == closure_param_id) { - // No need to set/copy the host_gather's closure tensor int - // the subrequest - it is just a dummy. host_gather writes - // to the right buffer directly. - continue; - } - - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - if (closure.get_element_type() != iport.get_element_type()) { - // Remember where the unpack is required - closure_unpack_required.push_back(cidx); - } else { - if (needs_copy(idx, cidx)) { - // Remember where copy is requried - closure_copy_required.push_back(cidx); - } else { - // Easy case, just set one to another - request->set_tensor(iport, ov::get_tensor_impl(closure)); - } - } - } // for(closure) - - // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){ - ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) { - auto cidx = closure_copy_required[j]; - auto& closure = comp_model_desc.closure[cidx]; - const auto closure_param_id = comp_model_desc.param_base + cidx; - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - auto clparam = request->get_tensor(iport); - ov::get_tensor_impl(closure)->copy_to(clparam._ptr); - }); - // }); // ms_to_run - - for (std::size_t j = 0; j != closure_unpack_required.size(); j++) { - // NB: No need to protect anything here as containers are all - // preallocated and we only access elements under particular (thread - // -local) indices. 
- auto cidx = closure_unpack_required[j]; - - // FIXME: zerops are stored with absolute indexing, this needs to be aligned - auto& closure = comp_model_desc.closure[cidx]; - - const auto closure_param_id = comp_model_desc.param_base + cidx; - auto& iport = func_desc.compiled_model->inputs()[closure_param_id]; - auto clparam = request->get_tensor(iport); - - if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) { - // Unpacking this weight requires scaling with zero points... - ov::npuw::util::unpack(ov::get_tensor_impl(closure), - ov::get_tensor_impl(comp_model_desc.zerops[cidx]), - ov::get_tensor_impl(comp_model_desc.scales[cidx]), - clparam); - } else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) { - // Unpacking this weight requires scaling - ov::npuw::util::unpack(ov::get_tensor_impl(closure), - ov::get_tensor_impl(comp_model_desc.scales[cidx]), - clparam); - } else { - // Unpacking this weight doesn't require scaling - ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam); - } - } -} - void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) { auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; auto real_idx = comp_model_desc.replaced_by.value_or(idx); @@ -1110,24 +868,6 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool } // if (replaced_by) } -ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type, - const ov::Shape& shape, - const std::string& device) { - if (device == "CPU" || ov::shape_size(shape) == 0) { - return ov::get_tensor_impl(ov::Tensor(type, shape)); - } - - // Protect access to shared context(s) - at least among infer requests - auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr; - auto remote_tensor = remote_ctx->create_host_tensor(type, shape); - return ov::get_tensor_impl(ov::make_tensor(remote_tensor)); -} - -ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output& node, - const std::string& device) { - return allocMem(node.get_element_type(), node.get_shape(), device); -} - void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) { get_real_subrequest(idx)->set_callback(std::move(cb)); } @@ -1140,10 +880,6 @@ void ov::npuw::JustInferRequest::cancel_subrequest(std::size_t idx) { m_subrequests[idx]->cancel(); } -std::size_t ov::npuw::JustInferRequest::total_subrequests() const { - return m_subrequests.size(); -} - bool ov::npuw::JustInferRequest::supports_async_pipeline() const { return false; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index d219f170a8e6bb..a935220b4b8943 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -23,13 +23,6 @@ namespace npuw { class CompiledModel; class AsyncInferRequest; -using LinkFrom = std::pair; // FIXME: This is a third, if not fourth, definitiion of such structure - -using TensorPtr = ov::SoPtr; - class MemAccessSim { public: explicit MemAccessSim(const std::shared_ptr& compiled_model); @@ -77,11 +70,7 @@ class JustInferRequest final : public IBaseInferRequest { public: explicit JustInferRequest(const std::shared_ptr& compiled_model); - // Query APIs - std::vector> query_state() const override; - std::vector get_profiling_info() const override; - -private: +protected: 
//////////////////////////////////// // implement IBaseInferRequest void prepare_for_infer() override; @@ -91,11 +80,11 @@ class JustInferRequest final : public IBaseInferRequest { void subscribe_subrequest(std::size_t idx, Completed cb) override; void complete_subrequest(std::size_t idx) override; void cancel_subrequest(std::size_t idx) override; - std::size_t total_subrequests() const override; bool supports_async_pipeline() const override; - void update_subrequest_links(std::size_t idx) override; + TensorPtr alloc_global_out(std::size_t out_idx) override; + //////////////////////////////////// // now own API @@ -104,9 +93,9 @@ class JustInferRequest final : public IBaseInferRequest { void bind_global_parameters(std::size_t idx); void bind_global_results(std::size_t idx); + using IBaseInferRequest::bind_global_results; void function_prologue(std::size_t idx); - void unpack_closure(std::size_t idx, RqPtr request); void unsafe_during(std::size_t real_idx, const std::function& f); void unsafe_infer(std::size_t real_idx); @@ -115,9 +104,6 @@ class JustInferRequest final : public IBaseInferRequest { void connect_subrequests(); void recreate_subrequests(std::size_t idx); - TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device); - TensorPtr allocOut(const ov::Output& node, const std::string& device); - FuncMemMgr m_func_mem_mgr; // Owns memory std::map m_funcall_result; // Provides a convenient link @@ -139,18 +125,6 @@ class JustInferRequest final : public IBaseInferRequest { // initialized. std::vector m_funcall_pipeline; - // This structure tracks how every individual subrequest - // access the model's top-level (global, public, etc) parameters - // and results - struct GlobalIO { - using map_t = std::map; - map_t global_params; // param idx -> input idx - map_t global_results; // result idx -> output idx - }; - std::vector m_subrequests_gio; - - std::unordered_set m_input_allocated; - // Represents spatial run-time info runtime::spatial::Selector::Ptr m_spatial_selector; diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp new file mode 100644 index 00000000000000..90eb62dcc0a8e3 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "unfold_sync_infer_request.hpp" + +#include "compiled_model.hpp" +#include "logging.hpp" +#include "openvino/core/parallel.hpp" + +ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr& compiled_model) + : ov::npuw::IBaseInferRequest(compiled_model) { + // Create infer requests + // Preallocate funcall tensors & substitute function call requests + for (std::size_t i = 0; i < m_num_submodels; i++) { + LOG_INFO("Creating infer request for Subgraph[" << i << "]..."); + LOG_BLOCK(); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i]; + + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + // no model & no funcall - optimized out, do nothing + LOG_INFO("OPTIMIZED OUT"); + continue; + } + + if (comp_model_desc.replaced_by) { + // Pre-allocate output tensors for this function call + const auto real_idx = comp_model_desc.replaced_by.value(); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (proto_comp_model_desc.spatial) { + NPUW_ASSERT(false && "Spatial is not supported in unfold"); + } + } // 
if(replaced_by) + + const auto real_idx = comp_model_desc.replaced_by.value_or(i); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + // NB: UnfoldInferRequest is _NOT_ fail-safe! Fail means fail here + m_subrequests[i] = proto_comp_model_desc.compiled_model->create_infer_request(); + m_subrequest_devices[i] = *proto_comp_model_desc.device_it; + LOG_INFO("DONE"); + } // for(submodels) + + alloc_io(); + + LOG_INFO("Connecting subrequests..."); + LOG_BLOCK(); + for (const auto& kvp : m_npuw_model->m_submodels_input_to_prev_output) { + const auto& subm_idx_to = kvp.first.first; + const auto& port_idx_to = kvp.first.second; + const auto& subm_idx_from = kvp.second.first; + const auto& port_idx_from = kvp.second.second; + + LOG_DEBUG("Subgraph[" << subm_idx_from << "]/" << port_idx_from << " --> " + << "Subgraph[" << subm_idx_to << "]/" << port_idx_to); + NPUW_ASSERT(m_subrequests[subm_idx_from]); // prod request is created + NPUW_ASSERT(m_subrequests[subm_idx_to]); // cons request is created + NPUW_ASSERT(m_subrequests[subm_idx_from]._ptr != m_subrequests[subm_idx_to]._ptr); + + const auto& iport = m_subrequests[subm_idx_to]->get_compiled_model()->inputs()[port_idx_to]; + const auto& oport = m_subrequests[subm_idx_from]->get_compiled_model()->outputs()[port_idx_from]; + const auto& tensor = m_subrequests[subm_idx_from]->get_tensor(oport); + LOG_DEBUG("Set Subgraph[" << subm_idx_to << "]/" << iport << " to Subgraph[" << subm_idx_from << "]/" << oport); + m_subrequests[subm_idx_to]->set_tensor(iport, tensor); + } // for(map) + LOG_INFO("Done"); + + init_gio(); + + for (size_t i = 0; i < m_num_submodels; i++) { + LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]..."); + LOG_BLOCK(); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i]; + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + continue; // Optimized out + } + unpack_closure(i, m_subrequests[i]); + LOG_VERB("Done"); + } +} + +bool ov::npuw::UnfoldInferRequest::valid_subrequest(std::size_t idx) const { + return m_subrequests.at(idx) != nullptr; +} + +void ov::npuw::UnfoldInferRequest::infer() { + const bool do_async = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>(); + + auto prepare = [&](std::size_t idx) { + if (idx >= m_subrequests.size()) { + return; + } + bind_global_params(idx, m_subrequests[idx]); + bind_global_results(idx, m_subrequests[idx]); + }; + auto wait_and_clear = [](RqPtrs& rqs) { + for (auto&& r : rqs) { + r->wait(); + } + rqs.clear(); + }; + + if (do_async) { + std::size_t past_repl_id = 0u; + RqPtrs previous_requests; + + prepare(0); + for (std::size_t idx = 0; idx < m_num_submodels; idx++) { + auto& subr = m_subrequests[idx]; + if (!subr) { + prepare(idx + 1); + continue; + } + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; + const auto this_repl_id = comp_model_desc.replaced_by.value_or(idx); + if (this_repl_id != past_repl_id) { + // For non-repeating blocks, the above value_or returns idx + // For repeating blocks, it returns the function group id + // If either is not equal to the past_repl_id, make a barrier here + wait_and_clear(previous_requests); + past_repl_id = this_repl_id; + } + subr->start_async(); + previous_requests.push_back(subr); + prepare(idx + 1); + } + wait_and_clear(previous_requests); + } else { + prepare(0); + for (std::size_t idx = 0; idx < m_num_submodels; idx++) { + auto& subr = m_subrequests[idx]; + if (!subr) { + prepare(idx + 1); + continue; + } + subr->start_async(); + 
            prepare(idx + 1);
+            subr->wait();
+        }
+    } // (async)
+}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp
new file mode 100644
index 00000000000000..76b67571ec4c40
--- /dev/null
+++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp
@@ -0,0 +1,42 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "base_sync_infer_request.hpp"
+
+namespace ov {
+namespace npuw {
+
+class UnfoldInferRequest final : public IBaseInferRequest {
+public:
+    explicit UnfoldInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
+
+    ////////////////////////////////////
+    // implement IBaseInferRequest - neither of these is required here
+    // this hierarchy needs review
+    void prepare_for_infer() override {}
+    bool valid_subrequest(std::size_t idx) const override;
+    void start_subrequest(std::size_t) override {}
+    void run_subrequest_for_success(std::size_t, bool&) override {}
+    void subscribe_subrequest(std::size_t, Completed cb) override {}
+    void complete_subrequest(std::size_t) override {}
+    void cancel_subrequest(std::size_t) override {}
+    bool supports_async_pipeline() const override {
+        return false;
+    }
+    void update_subrequest_links(std::size_t) override {}
+
+private:
+    void infer() override;
+};
+
+} // namespace npuw
+} // namespace ov
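
Usage note: the short sketch below illustrates how the new NPUW_UNFOLD_IREQS property introduced by this patch might be enabled from application code; it is not part of the patch. The property key and its default (false) come from the patch itself; the ov::Core calls are the standard OpenVINO 2.0 C++ API, while the "NPU_USE_NPUW" companion switch and the model path are assumptions made here for illustration only.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    const auto model = core.read_model("model.xml");  // placeholder model path

    // Request one infer request per NPUW partition, even for repeating blocks,
    // instead of the default folded execution.
    const ov::AnyMap config = {
        {"NPU_USE_NPUW", "YES"},       // assumed switch that routes compilation through NPUW
        {"NPUW_UNFOLD_IREQS", "YES"},  // property added by this patch; defaults to false
    };

    auto compiled = core.compile_model(model, "NPU", config);
    auto request = compiled.create_infer_request();
    request.infer();
    return 0;
}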