Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CPU] Support dynamic activation sparsity #27974

Open
wants to merge 40 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
f6070c8
add ActivationSparsityFusion
usstq Oct 29, 2024
6f48564
add activation sparse fc kernel
usstq Nov 14, 2024
c2a02ab
update i8 impl
usstq Dec 3, 2024
6057da2
add i4 impl
usstq Dec 4, 2024
f581ec8
fix int4 first-token
usstq Dec 5, 2024
6bc455f
add avx general intrinsic wrapper
usstq Dec 5, 2024
8ce0b42
add simd abstract & AVX512 support
usstq Dec 7, 2024
dad99c1
fix AVX512 bugs
usstq Dec 9, 2024
99b05bc
add reuse_B gemm kernel
usstq Dec 9, 2024
0b58da6
fix bug in MM_ComputeBounded_reuseB_i8
usstq Dec 10, 2024
a75aa9f
fix bug in reduce_outputs
usstq Dec 10, 2024
c131998
fix bug in avx512 int4
usstq Dec 13, 2024
61f7852
support sym
usstq Dec 13, 2024
599bb91
support f16 weights
usstq Dec 13, 2024
2e6a8ca
support f32 activation only
usstq Dec 13, 2024
e700cf2
replace intrinsic with jit
usstq Dec 17, 2024
d75a323
add i8 in jit_compile_accumulate_weight
usstq Dec 17, 2024
a4ac58f
i8 is fully-jitted
usstq Dec 17, 2024
cd84680
remove cross-compile
usstq Dec 18, 2024
b9a5265
simplify kernel interface
usstq Dec 18, 2024
ea77a20
add if_ & while_
usstq Dec 18, 2024
69e4467
add simd_jit header & do_while_
usstq Dec 19, 2024
7182fce
fix bugs
usstq Dec 19, 2024
3a4461b
clean-up
usstq Dec 19, 2024
8b1cd3d
Merge remote-tracking branch 'origin/master' into dynsparse
usstq Dec 19, 2024
f74b44f
add test case
usstq Dec 19, 2024
e745f17
fix CI issues
usstq Dec 19, 2024
fa98141
fix CI issue2
usstq Dec 19, 2024
dd68cb9
fix CI issue3
usstq Dec 19, 2024
2071c60
fix CI issue 4
usstq Dec 20, 2024
354e4de
fix CI issue 5
usstq Dec 20, 2024
a941617
fix test cases
usstq Dec 26, 2024
efc7c57
add boolean support to jit sreg expression
usstq Dec 27, 2024
ef6a6c1
fix review comment
usstq Dec 28, 2024
2f714a3
refactor executor
usstq Dec 30, 2024
36ff3d0
refactor
usstq Dec 30, 2024
31d77f1
fix code style
usstq Dec 30, 2024
6cc4e93
fix code style
usstq Jan 3, 2025
4f55060
fix CI
usstq Jan 3, 2025
19b53c4
fix CI
usstq Jan 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/plugins/intel_cpu/src/cpu_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,8 @@ static const TypeToNameMap& get_type_to_name_tbl() {
{"QKVProjection", Type::QKVProjection},
{"RMS", Type::RMS},
{"SearchSorted", Type::SearchSorted},
{"LoraSubgraph", Type::LoRA}};
{"LoraSubgraph", Type::LoRA},
{"ActSparseFC", Type::ActSparseFC}};
return type_to_name_tbl;
}

Expand Down Expand Up @@ -394,6 +395,7 @@ std::string NameFromType(const Type type) {
CASE(RMS);
CASE(SearchSorted);
CASE(LoRA);
CASE(ActSparseFC);
CASE(Unknown);
}
#undef CASE
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/cpu_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ enum class Type {
QKVProjection,
RMS,
SearchSorted,
LoRA
LoRA,
ActSparseFC,
};

enum class Algorithm {
Expand Down
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "transformations/cpu_opset/common/op/power_static.hpp"
#include "transformations/cpu_opset/common/op/sdpa.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"
#include "transformations/cpu_opset/x64/op/act_sparse_fc.hpp"
#include "transformations/cpu_opset/x64/op/interaction.hpp"
#include "transformations/cpu_opset/x64/op/llm_mlp.hpp"
#include "transformations/cpu_opset/x64/op/mha.hpp"
Expand Down Expand Up @@ -94,6 +95,7 @@ class TypeRelaxedExtension : public ov::OpExtension<ov::op::TypeRelaxed<Op>> {
OP_EXTENSION_X64(ov::intel_cpu::InteractionNode) \
OP_EXTENSION_X64(ov::intel_cpu::LLMMLPNode) \
OP_EXTENSION_X64(ov::intel_cpu::QKVProjectionNode) \
OP_EXTENSION_X64(ov::intel_cpu::ActSparseFCNode) \
OP_EXTENSION_X64(ov::intel_cpu::ScaledDotProductAttentionWithKVCache) \
OP_EXTENSION_X64(ov::intel_cpu::LoadConvertSaturation) \
OP_EXTENSION_X64(ov::intel_cpu::LoadConvertTruncation) \
Expand Down
242 changes: 242 additions & 0 deletions src/plugins/intel_cpu/src/nodes/act_sparse_fc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "act_sparse_fc.h"

#include "common/arbitrary_order_desc_creator.h"
#include "common/bfloat16.hpp"
#include "common/cpu_memcpy.h"
#include "common/primitive_hashing_utils.hpp"
#include "cpu/x64/cpu_isa_traits.hpp"
#include "cpu/x64/jit_generator.hpp"
#include "nodes/reorder.h"
#include "shape_inference/shape_inference_internal_dyn.hpp"
#include "utils/plain_tensor.hpp"

#if defined(OPENVINO_ARCH_X86_64)
# include "kernels/x64/act_sparse_fc_kernel.hpp"
#endif

#include "openvino/core/parallel.hpp"

using namespace dnnl::impl;
using namespace dnnl::impl::utils;

namespace ov {
namespace intel_cpu {
namespace node {

// Executes the activation-sparse fully-connected kernel:
// output[M, OC] = f(input[M, IC], weight), presumably skipping work for
// activations whose magnitude falls below m_config.threshold (the kernel
// consumes the threshold; exact sparsification is defined in
// kernels/x64/act_sparse_fc_kernel.hpp — confirm there).
void ActSparseFC::execute(dnnl::stream strm) {
    MAYBE_UNUSED(strm);  // execution is synchronous; the dnnl stream is not used here
    if (m_executor) {
        const auto* input = getSrcDataAtPortAs<float>(0);
        // m_weight / m_zp / m_scales were repacked once in createPrimitive()
        const auto* weight = m_weight->getDataAs<uint8_t>();
        // zero-point / scales are only populated for quantized configs (see createPrimitive)
        const auto* zp = m_config.with_zero_point ? m_zp->getDataAs<uint8_t>() : nullptr;
        const auto* scales = m_config.is_quantized ? m_scales->getDataAs<float>() : nullptr;
        auto* output = getDstDataAtPortAs<float>(0);

        const auto& ishape = getSrcMemoryAtPort(0)->getStaticDims();
        // Collapse all leading dims into M (rows); the innermost dim is IC.
        int M = shape_size(ishape) / ishape[ishape.size() - 1];

        // 7th argument is the (float) zero-point passed as 0 — per-tensor zp is
        // unused here; per-group zp travels through the `zp` pointer instead.
        (*m_executor)(input, output, M, m_config.ic, m_config.oc, m_config.threshold, 0, weight, scales, zp);
    }
}

// Constructs the CPU node from the matched ActSparseFCNode op and caches its
// compile-time configuration (quantization scheme, IC/OC sizes, threshold).
// Throws if the op does not pass isSupportedOperation().
ActSparseFC::ActSparseFC(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr context)
    : Node(op, context, NgraphShapeInferFactory(op)) {
    std::string errorMessage;

    if (!isSupportedOperation(op, errorMessage)) {
        OPENVINO_THROW("CPU: " + errorMessage);
    }
    const auto node = std::dynamic_pointer_cast<const ActSparseFCNode>(op);
    // A failed dynamic_pointer_cast yields nullptr; dereferencing it below would
    // be UB, so fail loudly instead of relying on isSupportedOperation alone.
    OPENVINO_ASSERT(node, "CPU: op '", op->get_friendly_name(), "' is not an ActSparseFCNode");
    m_config = node->get_config();
}

// Cache key for jitted ActSparseFC kernels: two nodes with identical
// quantization settings share one compiled kernel (shapes are runtime
// arguments, so they are deliberately not part of the key).
struct ActSparseFCKey {
    bool is_quantized;
    bool is_int4;
    bool with_zero_point;
    int ic_q_group_size;

    size_t hash() const {
        using namespace dnnl::impl::primitive_hashing;
        // Fold the fields in a fixed order so the hash is stable across runs.
        size_t h = 0;
        h = hash_combine(h, is_quantized);
        h = hash_combine(h, is_int4);
        h = hash_combine(h, with_zero_point);
        h = hash_combine(h, ic_q_group_size);
        return h;
    }

    bool operator==(const ActSparseFCKey& other) const {
        if (ic_q_group_size != other.ic_q_group_size)
            return false;
        if (is_int4 != other.is_int4)
            return false;
        return is_quantized == other.is_quantized && with_zero_point == other.with_zero_point;
    }
};

// One-time setup: (1) fetch or build the jitted kernel from the params cache,
// keyed only by quantization scheme (the kernel is shape-agnostic), then
// (2) repack the constant weights / zero-points / scales into the kernel's
// expected layouts, reusing the weights cache across nodes when available.
void ActSparseFC::createPrimitive() {
    ActSparseFCKey key;
    key.is_quantized = m_config.is_quantized;
    key.is_int4 = m_config.is_int4;
    key.with_zero_point = m_config.with_zero_point;
    key.ic_q_group_size = m_config.ic_q_group_size;

    auto buildExecutor = [&](const ActSparseFCKey& key) -> std::shared_ptr<ActSparseFcKernel> {
#if defined(OPENVINO_ARCH_X86_64)
        return std::make_shared<ActSparseFcKernel>(context->getScratchPad(),
                                                   key.is_quantized,
                                                   key.is_int4,
                                                   key.with_zero_point,
                                                   key.ic_q_group_size);
#else
        // non-x64: no kernel available; the nullptr triggers the throw below
        return nullptr;
#endif
    };

    m_executor = nullptr;
    auto cache = context->getParamsCache();
    auto result = cache->getOrCreate(key, buildExecutor);
    m_executor = result.first;

    if (!m_executor)
        OPENVINO_THROW("Failed to create executor for node ", getName(), ".");

    // reorder weights
    const auto& engine = getEngine();

    auto create_weight = [&]() {
        auto raw_weight_mem = getSrcMemoryAtPort(1);
        MemoryPtr weight_mem;
        if (m_config.is_int4) {
            // weight : [OC, IC/group_size, group_size] => [IC, OC/2, 2]
            // each row is further reordered in unit of 16 x i4 in [0,8,1,9,2,a,3,b,4,c,5,d,6,e,7,f] order
            // NOTE: same element count as the source, so the source descriptor is reused
            weight_mem = std::make_shared<Memory>(engine, raw_weight_mem->getDescPtr());

            const auto& dims = raw_weight_mem->getShape().getStaticDims();
            OPENVINO_ASSERT(dims.size() == 3);
            OPENVINO_ASSERT(dims[0] == static_cast<size_t>(m_config.oc));
            OPENVINO_ASSERT(dims[1] == static_cast<size_t>(m_config.ic / m_config.ic_q_group_size));
            OPENVINO_ASSERT(dims[2] == static_cast<size_t>(m_config.ic_q_group_size));

            auto* src = raw_weight_mem->getDataAs<uint8_t>();
            auto* dst = weight_mem->getDataAs<uint8_t>();
            m_executor->repack_weights_i4(src, dst, m_config.ic, m_config.oc);
        } else {
            // raw [OC, IC] layout
            // target [IC, OC] layout
            // (a plain transpose, expressed as an order-swapped descriptor + Reorder)
            ArbitraryOrderDescCreator descCreator({1, 0});
            auto dst_mem_desc =
                descCreator.createSharedDesc(raw_weight_mem->getPrecision(), raw_weight_mem->getShape());

            weight_mem = std::make_shared<Memory>(engine, dst_mem_desc);
            node::Reorder::reorderData(*raw_weight_mem, *weight_mem, context->getParamsCache());
        }
        return weight_mem;
    };

    auto create_zp_i4 = [&]() {
        // [OC, IC/group_size, 1] => [IC/group_size, OC]
        // zero-points use the same i4 nibble-interleave repack as the weights,
        // with IC/group_size playing the role of rows
        auto raw_zp_mem = getSrcMemoryAtPort(3);
        auto zp_mem = std::make_shared<Memory>(engine, raw_zp_mem->getDescPtr());

        auto* src = raw_zp_mem->getDataAs<uint8_t>();
        auto* dst = zp_mem->getDataAs<uint8_t>();

        m_executor->repack_weights_i4(src, dst, m_config.ic / m_config.ic_q_group_size, m_config.oc);
        return zp_mem;
    };

    auto create_scales_i4 = [&]() {
        // [OC, IC/group_size, 1] => [IC/group_size, OC]
        // f32 scales only need a transpose, done via a reversed-order descriptor
        auto raw_scales_mem = getSrcMemoryAtPort(2);
        ArbitraryOrderDescCreator descCreator({2, 1, 0});
        auto dst_mem_desc = descCreator.createSharedDesc(raw_scales_mem->getPrecision(), raw_scales_mem->getShape());

        auto scales_mem = std::make_shared<Memory>(engine, dst_mem_desc);
        node::Reorder::reorderData(*raw_scales_mem, *scales_mem, context->getParamsCache());
        return scales_mem;
    };

    if (!m_config.is_int4) {
        // int8 is perOC, no need for reorder
        if (m_config.is_quantized)
            m_scales = getSrcMemoryAtPort(2);
        if (m_config.with_zero_point)
            m_zp = getSrcMemoryAtPort(3);
    }

    auto weightCache = context->getWeightsCache();
    if (weightCache != nullptr) {
        // Share repacked constants between graph instances; the i4 flag is part
        // of the key because the repacked layout depends on it.
        const auto string_hash = getOriginalLayers() + std::to_string(m_config.is_int4);
        m_weight = *weightCache->findOrCreate(string_hash + "_weight", create_weight);
        if (m_config.is_int4) {
            if (m_config.with_zero_point)
                m_zp = *weightCache->findOrCreate(string_hash + "_zp_i4", create_zp_i4);
            if (m_config.is_quantized)
                m_scales = *weightCache->findOrCreate(string_hash + "_scales_i4", create_scales_i4);
        }
    } else {
        // No weights cache (e.g. single-instance graphs): repack privately.
        m_weight = create_weight();
        if (m_config.is_int4) {
            if (m_config.with_zero_point)
                m_zp = create_zp_i4();
            if (m_config.is_quantized)
                m_scales = create_scales_i4();
        }
    }
}

// Declares the single supported primitive descriptor: plain (ncsp) layouts,
// f32 activation in / f32 out, weights kept in their original precision.
// Port order matches the ActSparseFCNode op: 0=input, 1=weight,
// [2=scales, 3=zero-point when quantized].
void ActSparseFC::initSupportedPrimitiveDescriptors() {
    if (!supportedPrimitiveDescriptors.empty())
        return;

    // auto rtPrecision = getOriginalInputPrecisionAtPort(0);
    // OPENVINO_ASSERT(rtPrecision == ov::element::f32, "Unexpected rtPrecision:", rtPrecision);
    // f32-only for now (see "support f32 activation only" in history) — the
    // commented check above is the eventual precision-propagating variant.
    auto rtPrecision = ov::element::f32;

    std::vector<PortConfigurator> inPortConfigs;
    std::vector<PortConfigurator> outPortConfigs;

    inPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getInputShapeAtPort(0), false, -1);  // input
    inPortConfigs.emplace_back(LayoutType::ncsp,
                               getOriginalInputPrecisionAtPort(1),
                               getInputShapeAtPort(1),
                               false,
                               -1);  // weight
    if (m_config.is_quantized) {
        inPortConfigs.emplace_back(LayoutType::ncsp, ov::element::f32, getInputShapeAtPort(2), false, -1);  // scales
        if (m_config.with_zero_point)
            inPortConfigs.emplace_back(LayoutType::ncsp,
                                       getOriginalInputPrecisionAtPort(3),
                                       getInputShapeAtPort(3),
                                       false,
                                       -1);  // zero-pt
    }

    outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(0), false, -1);

    addSupportedPrimDesc(inPortConfigs, outPortConfigs, impl_desc_type::ref_any);
}

// Checks whether `op` can run as an ActSparseFC CPU node.
// Supported only on x86_64 and only when OC is a multiple of 32 (the kernel
// processes output channels in 32-wide groups). On failure, `errorMessage`
// explains why; never throws.
bool ActSparseFC::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept {
#if defined(OPENVINO_ARCH_X86_64)
    try {
        const auto node = std::dynamic_pointer_cast<const ActSparseFCNode>(op);
        // dynamic_pointer_cast returns nullptr on type mismatch; dereferencing
        // it is UB that catch(...) would NOT intercept, so check explicitly.
        if (!node) {
            errorMessage = "Only ActSparseFCNode operation is supported";
            return false;
        }
        const auto& config = node->get_config();
        if ((config.oc % 32) > 0) {
            errorMessage = "Unsupported OC size for node " + node->get_friendly_name();
            return false;
        }
    } catch (...) {
        return false;
    }
    return true;
#else
    // keep the caller's error message meaningful on non-x64 builds too
    errorMessage = "ActSparseFC is only supported on x86_64";
    return false;
#endif
}

} // namespace node
} // namespace intel_cpu
} // namespace ov
76 changes: 76 additions & 0 deletions src/plugins/intel_cpu/src/nodes/act_sparse_fc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "node.h"
#include "transformations/cpu_opset/x64/op/act_sparse_fc.hpp"

#if defined(OPENVINO_ARCH_X86_64)
# include "kernels/x64/act_sparse_fc_kernel.hpp"
#else
namespace ov {
namespace intel_cpu {
// Non-x86_64 stub of the jitted kernel so act_sparse_fc.cpp still compiles.
// It is never actually invoked: on non-x64, createPrimitive()'s builder
// returns nullptr and the node throws before any call reaches this class.
// NOTE(review): the constructor is declared but has no definition here, and
// its parameter list differs from the x64 kernel (no scratch-pad argument) —
// harmless only because the stub is never constructed; confirm if that changes.
class ActSparseFcKernel {
public:
    // compile time parameters
    ActSparseFcKernel(bool is_quantized, bool is_int4, bool with_zero_points, int ic_group_size);

    // Would compute output[M, OC] from input[M, IC]; always throws on this platform.
    void operator()(const float* input,
                    float* output,
                    int M,
                    int IC,
                    int OC,
                    float threshold,
                    float zero_point,
                    const void* W,
                    const float* scales,
                    const uint8_t* zp) {
        OPENVINO_THROW("Unsupported platform.");
    }

    // Would repack int4 weights into the kernel layout; always throws on this platform.
    void repack_weights_i4(uint8_t* src, uint8_t* dst, int IC, int OC) {
        OPENVINO_THROW("Unsupported platform.");
    }
};
} // namespace intel_cpu
} // namespace ov
#endif

namespace ov {
namespace intel_cpu {
namespace node {

// CPU node implementing a fully-connected layer with dynamic activation
// sparsity (matched from the ActSparseFCNode CPU-opset op). Weights may be
// plain, int8 per-OC quantized, or int4 group-quantized; they are repacked
// once in createPrimitive() and the jitted kernel is shared via caches.
class ActSparseFC : public Node {
public:
    ActSparseFC(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr context);

    // Nothing to do: descriptors come from initSupportedPrimitiveDescriptors().
    void getSupportedDescriptors() override {}
    bool created() const override {
        return getType() == Type::ActSparseFC;
    }
    bool needPrepareParams() const override {
        return false;  // this is a shape-agnostic kernel
    }
    void createPrimitive() override;
    // Dynamic shapes need no extra preparation — delegate straight to execute().
    void executeDynamicImpl(dnnl::stream strm) override {
        execute(strm);
    }
    void initSupportedPrimitiveDescriptors() override;
    void execute(dnnl::stream strm) override;
    static bool isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept;

private:
    // Jitted (or stub, on non-x64) kernel; shared across nodes via the params cache.
    std::shared_ptr<ov::intel_cpu::ActSparseFcKernel> m_executor;

    // Repacked constant inputs (see createPrimitive for layouts):
    MemoryPtr m_weight;  // weights in kernel layout
    MemoryPtr m_zp;      // zero-points (only when with_zero_point)
    MemoryPtr m_scales;  // dequant scales (only when is_quantized)

    // Compile-time config copied from the matched ActSparseFCNode op.
    ActSparseFCNode::Config m_config;
};

} // namespace node
} // namespace intel_cpu
} // namespace ov
Loading
Loading