[CPU] Avoid storing extra copies of constant inputs
EgorDuplensky committed Sep 5, 2024
1 parent 9969f9f commit ad121dd
Showing 32 changed files with 847 additions and 454 deletions.
34 changes: 1 addition & 33 deletions src/plugins/intel_cpu/src/cpu_memory.cpp
@@ -27,41 +27,9 @@ BlockedMemoryDescPtr IMemory::getDescWithType<BlockedMemoryDesc, 0, 0>() const {
}

namespace {
-inline void setSubnormalsToZero(float *data, size_t size) {
-    uint32_t *u32data = reinterpret_cast<uint32_t *>(data);
-    for (size_t i = 0; i < size; ++i) {
-        if ((u32data[i] & (0xFF << 23)) == 0) {
-            u32data[i] = 0;
-        }
-    }
-}
-
void transferData(const IMemory& src, const IMemory& dst, bool ftz) {
-    node::Reorder::reorderData(src, dst);
-
-    if (!ftz) {
-        return;
-    }
-    if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() == ov::element::bf16) {
-        return;
-    }
-    size_t offset = 0;
-    if (dst.getDesc().getType() & MemoryDescType::Dnnl) {
-        // here we can safely cast to DnnlMemoryDesc
-        auto dnnl_desc = dst.getDescWithType<DnnlMemoryDesc>();
-        auto desc = dnnl_desc->getDnnlDesc();
-        dnnl::impl::memory_desc_wrapper wrapper(desc.get());
-        offset = wrapper.offset0();
-        if (wrapper.is_wino_desc() || wrapper.is_rnn_packed_desc()) {
-            return;
-        }
-    }
-    // actual FTZ
-    auto* memData = static_cast<float*>(dst.getData());
-    memData += offset;
-    setSubnormalsToZero(memData, dst.getSize() / sizeof(float));
+    node::Reorder::reorderData(src, dst, nullptr, ftz);
}

} // namespace

Memory::Memory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) :
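The deleted block above is the flush-to-zero (FTZ) pass that used to run as a second traversal after every reorder: an IEEE-754 binary32 value is subnormal (or signed zero) exactly when its eight exponent bits (mask `0xFF << 23`) are all zero. A minimal standalone sketch of that check, using `memcpy` instead of the original's type-punning cast (illustrative, not the plugin's code):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Flush IEEE-754 binary32 subnormals to zero in place. A float is subnormal
// (or +/-0) exactly when its 8 exponent bits are all zero.
void setSubnormalsToZero(float* data, std::size_t size) {
    for (std::size_t i = 0; i < size; ++i) {
        std::uint32_t bits;
        std::memcpy(&bits, &data[i], sizeof(bits));
        if ((bits & (0xFFu << 23)) == 0) {  // exponent bits all zero: subnormal or +/-0
            data[i] = 0.0f;
        }
    }
}

int main() {
    float vals[] = {1.0f, 1e-45f /* subnormal */, -0.0f};
    setSubnormalsToZero(vals, 3);
    std::printf("%g %g %g\n", vals[0], vals[1], vals[2]);  // prints: 1 0 0
}
```

With this commit the same effect rides on the new `ftz` argument of `reorderData`, so the scan is folded into the reorder itself instead of re-touching the destination buffer afterwards.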
6 changes: 6 additions & 0 deletions src/plugins/intel_cpu/src/cpu_memory.h
@@ -177,6 +177,12 @@ class IMemory {
        return static_cast<T*>(getData());
    }

    template <element::Type_t ET>
    const typename element_type_traits<ET>::value_type* getDataAs() const {
        OPENVINO_ASSERT(ET == getPrecision(), "getDataAs() called for incorrect element type.");
        return static_cast<const typename element_type_traits<ET>::value_type*>(getData());
    }

    virtual size_t getSize() const = 0; // in bytes
    virtual const Shape& getShape() const = 0;
    virtual const VectorDims& getStaticDims() const = 0;
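The new const `getDataAs()` overload binds the pointee type at compile time and validates it against the memory's runtime precision before casting. The same pattern as a self-contained sketch (`ElemType`, `elem_of`, and `Buffer` are illustrative stand-ins for the plugin's `element_type_traits` machinery, not real OpenVINO types):

```cpp
#include <cassert>
#include <cstdio>

// A buffer tagged with a runtime element type, plus a template accessor that
// checks the tag before casting: wrong-type access fails loudly, not silently.
enum class ElemType { f32, i32 };

template <typename T> struct elem_of;
template <> struct elem_of<float> { static constexpr ElemType value = ElemType::f32; };
template <> struct elem_of<int>   { static constexpr ElemType value = ElemType::i32; };

struct Buffer {
    void* data;
    ElemType type;

    template <typename T>
    const T* getDataAs() const {
        assert(elem_of<T>::value == type && "getDataAs() called for incorrect element type");
        return static_cast<const T*>(data);
    }
};

int main() {
    float raw[2] = {1.5f, 2.5f};
    Buffer buf{raw, ElemType::f32};
    std::printf("%g\n", buf.getDataAs<float>()[0]);  // prints 1.5
}
```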
96 changes: 96 additions & 0 deletions src/plugins/intel_cpu/src/graph.cpp
@@ -9,14 +9,17 @@
#include <limits>
#include <map>
#include <memory>
#include <oneapi/dnnl/dnnl_common.hpp>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "cpu_types.h"
#include "edge.h"
#include "graph_context.h"
#include "graph_dumper.h"
#include "graph_optimizer.h"
#include "infer_request.h"
@@ -34,12 +37,14 @@
#include "openvino/core/model.hpp"
#include "openvino/core/node.hpp"
#include "openvino/core/type/element_type.hpp"
#include "ov_optional.hpp"
#include "utils/debug_capabilities.h"
#include "utils/general_utils.h"
#include "utils/ngraph_utils.hpp"
#include "utils/node_dumper.h"
#include "utils/verbose.h"
#include "utils/precision_support.h"
#include "utils/clone_original_blob.h"

#include <oneapi/dnnl/dnnl.hpp>
#include "common/primitive_desc_iface.hpp"
@@ -325,6 +330,8 @@ void Graph::InitGraph(bool optimize) {

    ResolveComplexInplaceConflicts();

    PreProcessConstantInputs();

    SortTopologically();

    const bool hasDynNodes = ProcessDynNodes();
@@ -900,6 +907,95 @@ bool Graph::ProcessDynNodes() {
    return containsDynamicNodes;
}

// @todo add ascii diagram
void Graph::PreProcessConstantInputs() {
    std::vector<bool> visited(graphNodes.size());

    std::function<ov::optional<InputPrepType>(NodePtr, int, bool)> visitConstantPath;
    visitConstantPath = [this, &visitConstantPath, &visited](NodePtr node,
                                                             int inPlaceOutPort,
                                                             bool oneShotCopyPossible) -> ov::optional<InputPrepType> {
        if (visited[node->getExecIndex()])
            return {};

        visited[node->getExecIndex()] = true;

        if (!node->getParentEdges().empty()) {
            for (size_t i = 0; i < node->getParentEdges().size(); i++) {
                const auto edge = node->getParentEdgeAt(i);
                const auto parent = edge->getParent();
                // keep track of inplace usage up the chain using inplace output ports
                inPlaceOutPort = inPlaceOutPort == parent->inPlaceOutPort(i) ? edge->parent_port : -1;

                // follow the chain through the first parent edge
                return visitConstantPath(parent, inPlaceOutPort, oneShotCopyPossible);
            }
        }

        // no parent edges means this is an input node
        OPENVINO_ASSERT(node->getType() == Type::Input, "Only an Input node is expected to have no parent edges");

        auto input = std::dynamic_pointer_cast<node::Input>(node);
        MemoryCPtr inputMemory = input->getMemoryPtr();

        InputPrepType prepType = requiresPreProcessing(*inputMemory, context, getEngine());

        if (prepType == InputPrepType::None) {
            return {};
        }

        const bool isInPlace = inPlaceOutPort >= 0;

        if (isInPlace && oneShotCopyPossible && !std::getenv("DISABLE_CLONE_POSTPONE")) {
            // the clone will be performed by the consuming node itself
            return ov::optional<InputPrepType>(prepType);
        }

        if (!isInPlace && prepType == InputPrepType::PutToNumaLocalCache && !std::getenv("DISABLE_CLONE_POSTPONE")) {
            // no need for a NUMA-local copy: the current constant path is not inplace,
            // so it produces a new blob anyway
            return {};
        }

        auto blobKey = [](std::shared_ptr<node::Input> input) {
            const auto memory = input->getMemoryPtr();
            return input->getName()
                   + "_" + std::to_string(memory->getSize() * memory->getPrecision().size())
                   + "_" + std::to_string(reinterpret_cast<uint64_t>(memory->getData()));
        };

        auto create = [&]() {
            return cloneBlob(*inputMemory, getEngine(), prepType == InputPrepType::FTZ);
        };

        auto weightCache = context->getWeightsCache();
        auto clone = weightCache ? *weightCache->findOrCreate(blobKey(input), create)
                                 : create();

        input->setMemoryPtr(clone);

        return {};
    };

    for (auto& node : graphNodes) {
        if (node->isConstant())
            continue;  // constant nodes are visited in scope of 'visitConstantPath'

        for (size_t i = 0; i < node->getParentEdges().size(); i++) {
            const auto parent = node->getParentEdgeAt(i)->getParent();

            if (!parent->isConstant())
                continue;

            const bool oneShotCopyPossible = node->canPrepInput(i);
            if (auto postponePreProcessing = visitConstantPath(parent, true, oneShotCopyPossible)) {
                const auto preprocessing = *postponePreProcessing;
                node->prepInput(i, preprocessing);
            }
        }
    }
}

void Graph::PushInputData(const std::size_t& index, const ov::SoPtr<ITensor>& input) {
    if (!IsReady()) OPENVINO_THROW("Wrong state. Topology not ready.");
    auto input_itr = inputNodesMap.find(index);
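`PreProcessConstantInputs` deduplicates clones through the context's weights cache: the key concatenates the input's name, byte size, and original data pointer, so the same constant reached from several graphs (for example, per-NUMA-node streams) is cloned only once. A standalone sketch of the `findOrCreate` idea (names are illustrative; the plugin's actual weights-sharing class differs in detail):

```cpp
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <string>

// Illustrative stand-in for a shared blob; the real code caches MemoryPtr.
struct Blob { std::string payload; };
using BlobPtr = std::shared_ptr<Blob>;

// Minimal findOrCreate cache: the creator runs only on a key miss, so an
// expensive clone happens once per unique (name, byte-size, data-pointer) key.
class WeightsCache {
public:
    BlobPtr findOrCreate(const std::string& key, const std::function<BlobPtr()>& create) {
        std::lock_guard<std::mutex> lock(mutex_);
        auto it = cache_.find(key);
        if (it != cache_.end()) {
            if (auto existing = it->second.lock())
                return existing;  // another graph already cloned this blob
        }
        BlobPtr fresh = create();
        cache_[key] = fresh;  // weak_ptr: the cache does not extend blob lifetime
        return fresh;
    }

private:
    std::mutex mutex_;
    std::map<std::string, std::weak_ptr<Blob>> cache_;
};

int main() {
    WeightsCache cache;
    auto creator = [] { return std::make_shared<Blob>(Blob{"weights"}); };
    BlobPtr a = cache.findOrCreate("fc1_4096_0xdeadbeef", creator);
    BlobPtr b = cache.findOrCreate("fc1_4096_0xdeadbeef", creator);
    // a and b point to the same clone; the creator ran only once
    return a == b ? 0 : 1;
}
```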
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/graph.h
@@ -222,6 +222,8 @@ class Graph {
    void ResolveEdgeConflicts();
    void ResolveComplexInplaceConflicts();
    bool ProcessDynNodes();
    void PreProcessConstantInputs();
    void GroupParallelNodes();
    void Allocate(const std::vector<size_t>& syncNodesInds);
    void AllocateWithReuse(const std::vector<size_t>& syncNodesInds);
    void CreatePrimitivesAndExecConstants() const;
15 changes: 11 additions & 4 deletions src/plugins/intel_cpu/src/node.cpp
@@ -35,6 +35,7 @@
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
#include "utils/clone_original_blob.h"

using namespace dnnl;
using namespace openvino;
@@ -917,7 +918,9 @@ void Node::prepareMemory(dnnl::primitive_desc_iterator& itpd) {
    Node::prepareMemory(intDescs);
}

-MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc, DnnlMemoryDescPtr srcWeightDesc) {
+MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc,
+                                    DnnlMemoryDescPtr srcWeightDesc,
+                                    InputPrepType preprocessing) {
    if (!getParentEdgeAt(1)->getParent()->isConstant())
        OPENVINO_THROW("Weight input is not const for node ", getName(), ".");
    auto edgeMem = getSrcMemoryAtPort(1);
@@ -933,10 +936,14 @@ MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc, DnnlMemoryDescPtr srcWeightDesc) {

    auto create = [&] () {
        Memory srcMemory{ getEngine(), srcWeightDesc, edgeMem->getData() };
-        MemoryPtr _ptr = std::make_shared<Memory>(getEngine(), dstWeightDesc);
-        node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache());
-        return _ptr;
+        MemoryPtr weightsMem = std::make_shared<Memory>(getEngine(), dstWeightDesc);
+
+        node::Reorder::reorderData(srcMemory,
+                                   *weightsMem,
+                                   context->getParamsCache(),
+                                   preprocessing == InputPrepType::FTZ);
+
+        return weightsMem;
    };

    MemoryPtr ptr;
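The net effect of the new `preprocessing` argument: when a weights blob must be repacked anyway, flush-to-zero is applied during that single pass rather than in a separate scan over an extra copy. A hedged sketch of the fused loop, with a plain element-wise copy standing in for the real layout reorder:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy src to dst and, when ftz is set, flush binary32 subnormals to zero in
// the same pass; no second traversal over the destination buffer is needed.
void reorderWithOptionalFtz(const float* src, float* dst, std::size_t n, bool ftz) {
    for (std::size_t i = 0; i < n; ++i) {
        float v = src[i];
        if (ftz) {
            std::uint32_t bits;
            std::memcpy(&bits, &v, sizeof(bits));
            if ((bits & (0xFFu << 23)) == 0) {  // exponent bits all zero: subnormal or +/-0
                v = 0.0f;
            }
        }
        dst[i] = v;
    }
}

int main() {
    float src[] = {1.0f, 1e-45f};
    float dst[2];
    reorderWithOptionalFtz(src, dst, 2, /*ftz=*/true);  // dst becomes {1, 0}
}
```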
22 changes: 21 additions & 1 deletion src/plugins/intel_cpu/src/node.h
@@ -5,6 +5,7 @@
#pragma once

#include <common/utils.hpp>
#include <cstddef>
#include <oneapi/dnnl/dnnl.hpp>
#include "cpu_memory.h"
#include "cpu_shape.h"
@@ -24,6 +25,7 @@
#include "utils/debug_capabilities.h"
#include "utils/bit_util.hpp"
#include "utils/debug_capabilities.h"
#include "utils/clone_original_blob.h"

#include "graph_context.h"
#include "nodes/executors/executor.hpp"
@@ -269,6 +271,22 @@ class Node {
        return !hasEmptyInputTensors();
    }

    /**
     * Returns true if the node can perform preprocessing of input \p idx on its own
     */
    virtual bool canPrepInput(size_t idx) const {
        (void) idx;
        return false;
    }

    /**
     * Requests the node to perform \p type preprocessing of input \p idx
     */
    virtual void prepInput(size_t idx, InputPrepType type) {
        (void) idx;
        (void) type;
    }

    enum class ConstantType {
        Const,   // Node is placed in a constant subgraph
        NoConst, // Node is placed in a non-constant subgraph
@@ -740,7 +758,9 @@
    virtual void prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx);
    void prepareMemory(dnnl::primitive_desc_iterator& itpd);

-    MemoryPtr prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc, DnnlMemoryDescPtr srcWeightDesc = nullptr);
+    MemoryPtr prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc,
+                                  DnnlMemoryDescPtr srcWeightDesc = nullptr,
+                                  InputPrepType preprocessing = InputPrepType::None);

    bool isDynamic = false;

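`canPrepInput`/`prepInput` form a small negotiation protocol: the graph asks whether a consumer can fold constant-input preprocessing into its own weight preparation, and if so delegates it instead of cloning the blob up front. A sketch of how a node might implement the pair (`NodeBase`, `FullyConnectedLike`, and the local `InputPrepType` enum are illustrative, not the plugin's classes):

```cpp
#include <cstddef>

enum class InputPrepType { None, FTZ, PutToNumaLocalCache };

class NodeBase {
public:
    virtual ~NodeBase() = default;
    // The graph asks: can preprocessing of input 'idx' be folded into your own prep?
    virtual bool canPrepInput(std::size_t idx) const { (void)idx; return false; }
    // The graph delegates: apply 'type' preprocessing to input 'idx' when packing it.
    virtual void prepInput(std::size_t idx, InputPrepType type) { (void)idx; (void)type; }
};

class FullyConnectedLike : public NodeBase {
public:
    bool canPrepInput(std::size_t idx) const override {
        return idx == 1;  // the weights input is repacked anyway, so FTZ can ride along
    }
    void prepInput(std::size_t idx, InputPrepType type) override {
        if (idx == 1) weightsPrep_ = type;  // consumed later during weight preparation
    }

private:
    InputPrepType weightsPrep_ = InputPrepType::None;
};

int main() {
    FullyConnectedLike fc;
    if (fc.canPrepInput(1)) {
        fc.prepInput(1, InputPrepType::FTZ);  // clone plus FTZ postponed to the node
    }
}
```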
[Diff for the remaining 26 changed files not loaded]
