Skip to content

Commit

Permalink
NPUW: Dynamic Spatial (openvinotoolkit#27104)
Browse files Browse the repository at this point in the history
### Details:
- Introduce a new SPATIAL pipeline which is a shortcut to
PIPELINE:REG+ISOLATE:COMPUTE+SPATIAL:ON;
- Refactor some code re: spatial regions handling in models and
requests;
- Finally, introduce dynamic dispatch over the spatial range
  - Based on runtime-detected features
  - Can be disabled to measure full range performance

### Tickets:
 - E-143572
  • Loading branch information
dmatveev authored Oct 18, 2024
1 parent 8822480 commit 212be8e
Show file tree
Hide file tree
Showing 14 changed files with 270 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime);
DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime);
DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 64, npuw::partitioning::spatial_nway, CompileTime);
DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, CompileTime);
DEFINE_OPT(NPUW_SPATIAL_DYN, bool, true, npuw::partitioning::spatial_dyn, CompileTime);
DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);
DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
Expand Down
12 changes: 10 additions & 2 deletions src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ namespace online {
* @brief
* Type: std::string.
* Specify which partitioning pipeline to run.
* Possible values: "NONE", "INIT", "JUST", "REP", "REG", "COMPUTE".
* Possible values: "NONE", "INIT", "JUST", "REP", "REG", "COMPUTE", "SPATIAL".
* Default value: "REG".
*/
static constexpr ov::Property<std::string> pipeline{"NPUW_ONLINE_PIPELINE"};
Expand Down Expand Up @@ -206,10 +206,18 @@ static constexpr ov::Property<bool> spatial{"NPUW_SPATIAL"};
* @brief
* Type: std::size_t.
* Submission size for the spatial execution.
* Default value: 64
* Default value: 128
*/
static constexpr ov::Property<std::size_t> spatial_nway{"NPUW_SPATIAL_NWAY"};

/**
* @brief
* Type: boolean.
* Enable dynamic submission for spatial subgraphs. Requires SPATIAL pipeline to be selected.
* Default value: true
*/
static constexpr ov::Property<bool> spatial_dyn{"NPUW_SPATIAL_DYN"};

/**
* @brief
* Type: boolean
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_PMM>();
desc.add<NPUW_SPATIAL>();
desc.add<NPUW_SPATIAL_NWAY>();
desc.add<NPUW_SPATIAL_DYN>();
desc.add<NPUW_HOST_GATHER>();
desc.add<NPUW_DCOFF_TYPE>();
desc.add<NPUW_DCOFF_SCALE>();
Expand Down
17 changes: 4 additions & 13 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,18 +283,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,

// Fill in the spatial information, if it is present
if (fcn_template._spatial) {
using S = CompiledModelDesc::Spatial;
S s;
s.range = fcn_template._spatial->_range;
s.nway = fcn_template._spatial->_slice;
s.out_dim = fcn_template._spatial->_out_dim;
s.nway_iters = s.range / s.nway;
s.tail_size = s.range % s.nway;
for (auto&& input : fcn_template._spatial->_inputs) {
std::size_t p_idx = fcn_template._model->get_parameter_index(input.param);
s.params.push_back(S::Param{p_idx, input.dim});
}
m_compiled_submodels[id].spatial = std::move(s);
m_compiled_submodels[id].spatial =
compiled::Spatial(fcn_template._spatial.value(), fcn_template._model);
}
LOG_INFO("Subgraph[" << id << "] is a function body for " << subgraph._funcall);
} else {
Expand Down Expand Up @@ -918,7 +908,8 @@ void ov::npuw::CompiledModel::implement_properties() {
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
BIND(npuw::partitioning::spatial, NPUW_SPATIAL_NWAY),
BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
Expand Down
18 changes: 3 additions & 15 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (C) 2023 Intel Corporation
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

Expand All @@ -13,6 +13,7 @@
#include "openvino/runtime/icompiled_model.hpp"
#include "openvino/runtime/so_ptr.hpp"
#include "partitioning/partitioning.hpp"
#include "spatial.hpp"
#include "weights_bank.hpp"

namespace intel_npu {
Expand Down Expand Up @@ -123,20 +124,7 @@ class CompiledModel : public ov::ICompiledModel {
std::optional<std::size_t> replaced_by;

Subgraph::Gather host_gather;
struct Spatial {
struct Param {
std::size_t idx;
std::size_t dim;
};
std::vector<Param> params;
std::size_t range = 0u;
std::size_t nway = 0u;
std::size_t out_dim = 0u;

std::size_t nway_iters = 0u;
std::size_t tail_size = 0u;
};
std::optional<Spatial> spatial;
std::optional<ov::npuw::compiled::Spatial> spatial;

// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
// w.r.t. function calls
Expand Down
35 changes: 33 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
// Create infer requests
// Preallocate funcall tensors & substitute function call requests
bool failover_happened = false;
bool has_spatial = false;
for (size_t i = 0; i < m_num_submodels; i++) {
LOG_INFO("Creating infer request for Subgraph[" << i << "]...");
LOG_BLOCK();
Expand All @@ -221,6 +222,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com

// Initialize the spatial IO placeholders, if required
if (proto_comp_model_desc.spatial) {
has_spatial = true;

m_spatial_io[real_idx].inputs.resize(proto_comp_model_desc.param_base);
m_spatial_io[real_idx].input_tails.resize(proto_comp_model_desc.param_base);
m_spatial_io[real_idx].outputs.resize(num_outputs);
Expand Down Expand Up @@ -399,6 +402,24 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
} // for(closure)
LOG_VERB("DONE");
}

// Handle spatial dynamic submission
if (has_spatial) {
if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_SPATIAL_DYN>()) {
LOG_VERB("Finding spatial features...");
LOG_BLOCK();
m_spatial_selector = runtime::spatial::AttentionMask::find(*this);
if (!m_spatial_selector) {
LOG_WARN("Spatial capability is enabled, but no run-time features were found.");
// Fallback selector to ALL
m_spatial_selector.reset(new runtime::spatial::All());
}
} else {
// Just force selector to ALL
m_spatial_selector.reset(new runtime::spatial::All());
}
LOG_VERB("Done");
}
}

void ov::npuw::JustInferRequest::connect_subrequests() {
Expand Down Expand Up @@ -506,6 +527,11 @@ void ov::npuw::JustInferRequest::prepare_for_infer() {
LOG_DEBUG("Pre-initializing weights for subgraph[" << id << "]");
unpack_closure(id, m_subrequests[id]);
}

// Adjust spatial input range, if supported
if (m_spatial_selector) {
m_spatial_selector->prepare();
}
LOG_DEBUG("Done");
}

Expand Down Expand Up @@ -915,6 +941,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) {
// must be prepared in the m_spatial_io at this point
const auto& spatial = comp_model_desc.spatial.value();
const auto num_outputs = comp_model_desc.compiled_model->outputs().size();
NPUW_ASSERT(m_spatial_selector);

// Create a sparse vector with full input sizes.
// For the access simplicity, its size is aligned with function's
Expand All @@ -940,6 +967,10 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) {

std::size_t offset = 0u;
for (std::size_t i = 0u; i < spatial.nway_iters; i++, offset += spatial.nway) {
if (!m_spatial_selector->need_submit(offset, spatial.nway)) {
continue;
}

// Collect spatial inputs for this offset
for (auto&& param : spatial.params) {
const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx];
Expand All @@ -963,7 +994,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) {
} // for(full_nway_times)

// Now process the tail, if required
if (spatial.tail_size) {
if (spatial.tail_size && m_spatial_selector->need_submit(offset, spatial.tail_size)) {
// Copy the sub-ranges to spatial inputs
// NOTE: tails buffers are read from/written to at 0th offset!
for (auto&& param : spatial.params) {
Expand Down Expand Up @@ -1085,7 +1116,7 @@ ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type
return ov::get_tensor_impl(ov::Tensor(type, shape));
}

std::lock_guard<std::mutex> guard(m_alloc_mutex);
// Protect access to shared context(s) - at least among infer requests
auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
auto remote_tensor = remote_ctx->create_host_tensor(type, shape);
return ov::get_tensor_impl(ov::make_tensor(remote_tensor));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "openvino/runtime/iremote_context.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include "openvino/runtime/tensor.hpp"
#include "spatial.hpp"

namespace ov {
namespace npuw {
Expand Down Expand Up @@ -148,8 +149,10 @@ class JustInferRequest final : public IBaseInferRequest {
};
std::vector<GlobalIO> m_subrequests_gio;

std::mutex m_alloc_mutex;
std::unordered_set<void*> m_input_allocated;

// Represents spatial run-time info
runtime::spatial::Selector::Ptr m_spatial_selector;
};

} // namespace npuw
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,12 +267,13 @@ void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) {
// Interface to get online partitioning from the model
class Compiler {
enum class Pipeline {
NONE, // Partitioning will consist of a single group with all the Ops
INIT, // Initialize only. The hardest mode, every group has just 1 layer inside
JUST, // "justParitioning" - combination of LHF + Remnants
REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants
REG, // Regularized repeated blocks pipeline -same as REP, but with some strong hints first
COMPUTE // Separates non-foldable compute subgraphs from the model based on predefined rules + REP
NONE, // Partitioning will consist of a single group with all the Ops
INIT, // Initialize only. The hardest mode, every group has just 1 layer inside
JUST, // "justParitioning" - combination of LHF + Remnants
REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants
REG, // Regularized repeated blocks pipeline - same as REP, but with some strong hints first
COMPUTE, // Separates non-foldable compute subgraphs from the model based on predefined rules + REP
SPATIAL // Similar to COMPUTE but allows folding
};

template <class C>
Expand All @@ -299,6 +300,8 @@ class Compiler {
return Pipeline::REG;
} else if (pipeline_opt == "COMPUTE") {
return Pipeline::COMPUTE;
} else if (pipeline_opt == "SPATIAL") {
return Pipeline::SPATIAL;
} else {
LOG_WARN("Unknown partitioning compiler pipeline " << pipeline_opt << ", switching to REP");
return Pipeline::REP;
Expand Down Expand Up @@ -428,6 +431,16 @@ class Compiler {
m_snapshot->setCtx(ctx);
rep();
break;
case Pipeline::SPATIAL:
warn_unused<::intel_npu::NPUW_ONLINE_ISOLATE>();
m_cfg.update(::intel_npu::Config::ConfigMap{{std::string(::intel_npu::NPUW_SPATIAL::key()), "YES"}});

// Manually set predefined isolates and nofolds then do rep() pipeline
// FIXME: initialize via a dedicated function instead of parsing
ctx.isolates = detail::getIsolates(detail::ISOL_PRESETS.at("COMPUTE"));
m_snapshot->setCtx(ctx);
rep();
break;
}

LOG_DEBUG("Online partitioning: group sizes after compilation:");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1605,7 +1605,7 @@ void Partitioner::identifySpatialRange(ov::npuw::Function& f) {
const auto& f_params = f._model->get_parameters();
NPUW_ASSERT(f_params.size() > 0);

using S = ov::npuw::Function::Spatial;
using S = ov::npuw::function::Spatial;
S spatial;
spatial._range = f_result_0_shape[1];
spatial._out_dim = 1; // the only case we're looking into now
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <vector>

#include "../lazy_tensor.hpp"
#include "../spatial.hpp"
#include "intel_npu/al/config/config.hpp"
#include "openvino/openvino.hpp"

Expand Down Expand Up @@ -70,20 +71,7 @@ struct Function {
// NOTE: it seems it is required only for `matchRepeatedSubgraphs()'
std::map<std::pair<std::string, std::size_t>, std::size_t> _param_mapping;

// Spatial information. So far assume spatial execution in 1 dimension only
struct Spatial {
using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
struct Param {
PPtr param;
std::size_t dim;
};
std::size_t _range = 0u; // Range over which spatial execution is organized, e.g. 1024
std::size_t _slice = 0u; // A submission size for a single execution, e.g. 128
std::size_t _out_dim = 0u; // Assume it is the same dim for all Results
std::vector<Param> _inputs;
};
using SpatialOpt = std::optional<Spatial>;
SpatialOpt _spatial;
std::optional<ov::npuw::function::Spatial> _spatial;
};

struct Group {
Expand Down
44 changes: 44 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/spatial.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "spatial.hpp"

#include "util.hpp"

// Bind the selector to an infer request and remember which of its inputs
// is the attention mask (the index is located by AttentionMask::find()).
// NOTE(review): m_rq is a stored reference - assumes the infer request
// outlives this selector; verify against the owning request's lifetime.
ov::npuw::runtime::spatial::AttentionMask::AttentionMask(std::size_t param_idx, const ov::ISyncInferRequest& rq)
    : m_attn_mask_param_idx(param_idx),
      m_rq(rq) {}

// Factory: scan the request's inputs for an attention mask suitable to
// drive dynamic spatial dispatch. Returns an AttentionMask selector bound
// to the first matching input, or an empty Ptr if none is found.
ov::npuw::runtime::spatial::Selector::Ptr ov::npuw::runtime::spatial::AttentionMask::find(
    const ov::ISyncInferRequest& rq) {
    // A match is an input named "attention_mask" of rank 1, or rank 2
    // with a leading dimension of 1.
    auto looks_like_attention_mask = [](const ov::Output<const ov::Node>& port) {
        if (port.get_node()->get_friendly_name() != "attention_mask") {
            return false;
        }
        const auto shape = port.get_shape();
        return shape.size() == 1 || (shape.size() == 2 && shape[0] == 1);
    };

    const auto& input_ports = rq.get_inputs();
    for (std::size_t idx = 0u; idx < input_ports.size(); idx++) {
        if (looks_like_attention_mask(input_ports[idx])) {
            return Selector::Ptr{new AttentionMask(idx, rq)};
        }
    }
    return Selector::Ptr{};
}

// Refresh the valid interest region from the attention mask's current
// contents, before the next inference.
// Strong assumption: the mask is dense, i.e. it has zero or one
// continuous region of interest.
void ov::npuw::runtime::spatial::AttentionMask::prepare() {
    const auto& mask_port = m_rq.get_compiled_model()->inputs()[m_attn_mask_param_idx];
    const auto [range_begin, range_end] = ov::npuw::util::validMaskRange(m_rq.get_tensor(mask_port));
    m_valid_range_begin = range_begin;
    m_valid_range_end = range_end;
}

// Decide whether the spatial chunk [offset, offset+len) must be submitted.
// We skip the request only when the chunk has an empty intersection with
// the valid range [m_valid_range_begin, m_valid_range_end):
// - it lies completely below the valid range, or
// - it lies completely above the valid range.
// In all other cases, we submit.
// NOTE: both intervals are half-open, hence `<=` on the lower-bound test -
// a chunk ending exactly at m_valid_range_begin contributes nothing
// (previously `<` caused one redundant submission in that case).
bool ov::npuw::runtime::spatial::AttentionMask::need_submit(std::size_t offset, std::size_t len) const {
    return !(offset + len <= m_valid_range_begin || offset >= m_valid_range_end);
}
Loading

0 comments on commit 212be8e

Please sign in to comment.