Skip to content

Commit

Permalink
[STFT][CPU] Improve performance of STFT for CPU by reusage RDFT jit E…
Browse files Browse the repository at this point in the history
…xecutor (openvinotoolkit#26967)

### Details:
- Improve performance of STFT for the CPU plugin by reusing the RDFT JIT
executor
 - Use parallel loops in stft
- No changes to the logic of the existing RDFT executor; the
RDFTExecutor::build function has been added so that the RDFT details stay
hidden in the .cpp file, as before.
 - Perf numbers collected within the ticket
 
### Tickets:
 - 156115

---------

Co-authored-by: Michal Lukaszewski <[email protected]>
  • Loading branch information
mitruska and mlukasze authored Dec 19, 2024
1 parent e8f906e commit 13d60b1
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 27 deletions.
41 changes: 22 additions & 19 deletions src/plugins/intel_cpu/src/nodes/rdft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -838,17 +838,20 @@ struct RDFTJitExecutor : public RDFTExecutor {
rdftKernel.reset(new jit_dft_kernel_f32<cpu::x64::avx512_core>(isInverse, rdftType));
dftKernel.reset(new jit_dft_kernel_f32<cpu::x64::avx512_core>(isInverse, complex_to_complex));
vlen = cpu_isa_traits<cpu::x64::avx512_core>::vlen;
primDesc->setImplementationType(jit_avx512);
if (primDesc)
primDesc->setImplementationType(jit_avx512);
} else if (mayiuse(cpu::x64::avx2)) {
rdftKernel.reset(new jit_dft_kernel_f32<cpu::x64::avx2>(isInverse, rdftType));
dftKernel.reset(new jit_dft_kernel_f32<cpu::x64::avx2>(isInverse, complex_to_complex));
vlen = cpu_isa_traits<cpu::x64::avx2>::vlen;
primDesc->setImplementationType(jit_avx2);
if (primDesc)
primDesc->setImplementationType(jit_avx2);
} else if (mayiuse(cpu::x64::sse41)) {
rdftKernel.reset(new jit_dft_kernel_f32<cpu::x64::sse41>(isInverse, rdftType));
dftKernel.reset(new jit_dft_kernel_f32<cpu::x64::sse41>(isInverse, complex_to_complex));
vlen = cpu_isa_traits<cpu::x64::sse41>::vlen;
primDesc->setImplementationType(jit_sse42);
if (primDesc)
primDesc->setImplementationType(jit_sse42);
} else {
OPENVINO_THROW("Can't create RDFT kernel");
}
Expand Down Expand Up @@ -1075,22 +1078,6 @@ struct RDFTRefExecutor : public RDFTExecutor {
}
};

// Key type used when looking up an RDFT executor; the only distinguishing
// field is the transform direction.
struct RDFTKey {
    bool isInverse;

    // Folds the direction flag into a zero seed and returns the result.
    size_t hash() const {
        using namespace dnnl::impl::primitive_hashing;

        size_t seed = 0;
        return hash_combine(seed, isInverse);
    }

    // Two keys are equal when they describe the same direction.
    bool operator==(const RDFTKey& rhs) const {
        return rhs.isInverse == isInverse;
    }
};

void RDFT::createPrimitive() {
RDFTKey key{};
key.isInverse = inverse;
Expand All @@ -1115,6 +1102,22 @@ void RDFT::createPrimitive() {

Node::createPrimitive();
}

/**
 * Creates an RDFT executor for the requested direction.
 *
 * On x86-64 builds, when at least SSE4.1 is available, a JIT executor is
 * returned; otherwise the reference executor is used.
 *
 * @param inverse   true for the inverse transform, false for the forward one.
 * @param primDesc  optional primitive descriptor whose implementation type is
 *                  updated to reflect the chosen executor; may be nullptr, in
 *                  which case no implementation type is recorded.
 * @return the created executor.
 */
std::shared_ptr<RDFTExecutor> RDFTExecutor::build(bool inverse, NodeDesc* primDesc) {
    std::shared_ptr<RDFTExecutor> executor;
#if defined(OPENVINO_ARCH_X86_64)
    using namespace dnnl::impl;
    using namespace dnnl::impl::cpu::x64;
    if (mayiuse(cpu::x64::sse41)) {
        // The JIT executor records its own implementation type on primDesc.
        executor = std::make_shared<RDFTJitExecutor>(inverse, primDesc);
        return executor;
    }
#endif
    executor = std::make_shared<RDFTRefExecutor>(inverse);
    // primDesc is optional (the declaration defaults it to nullptr), so guard
    // the dereference exactly like the JIT path does.
    if (primDesc) {
        primDesc->setImplementationType(ref_any);
    }
    return executor;
}

} // namespace node
} // namespace intel_cpu
} // namespace ov
17 changes: 17 additions & 0 deletions src/plugins/intel_cpu/src/nodes/rdft.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#pragma once

#include "common/primitive_hashing_utils.hpp"
#include "kernels/x64/rdft_kernel.hpp"
#include "node.h"

Expand All @@ -30,6 +31,8 @@ struct RDFTExecutor {
const std::vector<size_t>& outputShape,
const std::vector<int>& axes);

static std::shared_ptr<RDFTExecutor> build(bool inverse, NodeDesc* primDesc = nullptr);

protected:
bool isInverse;

Expand Down Expand Up @@ -125,6 +128,20 @@ class RDFT : public Node {
bool isSignalSizesConstant = false;
};

// Key for looking up an RDFT executor in the parameters cache; executors are
// distinguished only by transform direction.
struct RDFTKey {
    bool isInverse;

    // Hash of the key: the direction flag combined into a zero seed.
    size_t hash() const {
        size_t seed = 0;
        return dnnl::impl::hash_combine(seed, isInverse);
    }

    // Keys compare equal when their direction flags match.
    bool operator==(const RDFTKey& other) const {
        return other.isInverse == isInverse;
    }
};

} // namespace node
} // namespace intel_cpu
} // namespace ov
114 changes: 106 additions & 8 deletions src/plugins/intel_cpu/src/nodes/stft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

#include "stft.h"

#include "cpu/x64/cpu_isa_traits.hpp"
#include "cpu/x64/jit_generator.hpp"
#include "nodes/common/cpu_memcpy.h"
#include "openvino/core/parallel.hpp"
#include "openvino/core/type.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/stft.hpp"
Expand Down Expand Up @@ -73,15 +77,95 @@ bool STFT::created() const {
return getType() == Type::STFT;
}

namespace {
// Copies a 4-D buffer from `in` to `out` while swapping axes 1 and 2.
// The innermost axis is copied as one contiguous run of
// out_shape[3] * elem_size bytes; the three outer output axes are iterated
// in parallel.
static void transpose_out4d(const uint8_t* in,
                            uint8_t* out,
                            const VectorDims& in_shape,
                            const VectorDims& out_shape,
                            size_t elem_size) {
    parallel_for3d(
        out_shape[0],
        out_shape[1],
        out_shape[2],
        [in, out, &in_shape, &out_shape, elem_size](size_t d0, size_t d1, size_t d2) {
            // Output position (d0, d1, d2) is read from input position (d0, d2, d1).
            const size_t src_elem = ((d0 * in_shape[1] + d2) * in_shape[2] + d1) * in_shape[3];
            const size_t dst_elem = ((d0 * out_shape[1] + d1) * out_shape[2] + d2) * out_shape[3];
            cpu_memcpy(out + dst_elem * elem_size, in + src_elem * elem_size, out_shape[3] * elem_size);
        });
}
}  // namespace

// Computes the STFT of the signal input: every frame is multiplied by a
// zero-padded window and transformed with the shared RDFT executor, with the
// (batch, frame) pairs processed in parallel. When m_transpose_frames is set,
// results are first written to a scratchpad and then copied to the output
// with the frame and frequency axes swapped.
void STFT::execute(dnnl::stream strm) {
// NOTE(review): this reference call and the executor-based code below both
// write to output 0. In the diff view this call looks like the removed
// pre-change implementation — confirm it is absent from the actual file.
ov::reference::stft(getSrcDataAtPortAs<const float>(DATA_IDX),
getSrcDataAtPortAs<const float>(WINDOW_IDX),
getDstDataAtPortAs<float>(0),
ov::Shape{getSrcMemoryAtPort(DATA_IDX)->getStaticDims()},
ov::Shape{getSrcMemoryAtPort(WINDOW_IDX)->getStaticDims()},
(getSrcDataAtPortAs<const int32_t>(FRAME_SIZE_IDX))[0],
(getSrcDataAtPortAs<const int32_t>(FRAME_STEP_IDX))[0],
m_transpose_frames);
const float* signal = getSrcDataAtPortAs<const float>(DATA_IDX);
const float* window = getSrcDataAtPortAs<const float>(WINDOW_IDX);
float* rdft_result = getDstDataAtPortAs<float>(0);
const VectorDims& signal_shape = getSrcMemoryAtPort(DATA_IDX)->getStaticDims();
const VectorDims& window_shape = getSrcMemoryAtPort(WINDOW_IDX)->getStaticDims();
// frame_size and frame_step are read as the first int32 element of their inputs.
const int64_t frame_size = (getSrcDataAtPortAs<const int32_t>(FRAME_SIZE_IDX))[0];
const int64_t frame_step = (getSrcDataAtPortAs<const int32_t>(FRAME_STEP_IDX))[0];

// A 1-D signal is handled as a single batch; otherwise axis 0 is the batch
// and axis 1 is the signal.
const auto is_signal_1D = signal_shape.size() == 1;
const size_t batch_size = is_signal_1D ? 1 : signal_shape[0];
const size_t signal_axis = is_signal_1D ? 0 : 1;
const auto signal_length = signal_shape[signal_axis];
// NOTE(review): mixes an unsigned length with signed frame_size/frame_step;
// assumes 0 < frame_size <= signal_length and frame_step > 0 — confirm this
// is validated before execute() is reached.
const auto num_frames = static_cast<size_t>((signal_length - frame_size) / frame_step) + 1;
const auto frame_size_dim = static_cast<size_t>(frame_size);
// Per-frame RDFT output: frame_size / 2 + 1 entries of 2 floats each
// (presumably real and imaginary parts).
const auto fft_out_shape = VectorDims{static_cast<size_t>((frame_size_dim / 2) + 1), 2};
const auto fft_out_shape_size = shape_size(fft_out_shape);

// Build a window of frame_size elements, zero-filled, with the input window
// copied in starting at offset (frame_size - window_length) / 2.
// NOTE(review): the copy length uses window_shape[0] rather than the clamped
// window_length; if window_shape[0] could exceed frame_size this would write
// past pad_window — confirm the window size is validated upstream.
const auto window_length = window_shape[0] < frame_size_dim ? window_shape[0] : frame_size_dim;
std::vector<float> pad_window(frame_size, 0);
cpu_parallel_memcpy(pad_window.data() + (frame_size_dim - window_length) / 2,
window,
sizeof(float) * window_shape[0]);

// Without transposition the RDFT writes straight into the node output;
// with transposition it writes into a scratchpad of the untransposed shape.
float* dst = rdft_result;
const auto stft_shape = VectorDims{batch_size, num_frames, fft_out_shape[0], fft_out_shape[1]};
if (m_transpose_frames) { // Store intermediate results
MemoryPtr dst_mem =
getScratchPadMem(std::make_shared<CpuBlockedMemoryDesc>(ov::element::f32, Shape{stft_shape}));
dst = dst_mem->getDataAs<float>();
}

// One task per (batch, frame) pair.
parallel_for2d(batch_size, num_frames, [&](size_t batch, size_t frame_idx) {
size_t batch_in_start = batch * signal_length;
size_t batch_frames_out = batch * num_frames;

// Copy the frame into a private buffer and apply the window element-wise.
const auto frame_start = batch_in_start + frame_idx * frame_step;
const auto frame_end = frame_start + frame_size;
std::vector<float> signal_slice(signal + frame_start, signal + frame_end);
std::transform(signal_slice.begin(),
signal_slice.end(),
pad_window.begin(),
signal_slice.begin(),
std::multiplies<float>());

// Offset (in floats) of this frame's result inside dst.
const auto result_idx = (batch_frames_out + frame_idx) * fft_out_shape_size;
// NOTE(review): twiddles are generated for every frame from arguments that
// do not depend on the frame; they may be hoistable out of the loop —
// confirm against the generateTwiddles/execute contracts.
auto twiddles = rdft_executor->generateTwiddles({static_cast<int>(signal_slice.size())}, fft_out_shape, {0});
rdft_executor->execute(signal_slice.data(),
dst + result_idx,
twiddles,
1,
{0},
{static_cast<int>(frame_size)},
{frame_size_dim},
fft_out_shape,
{1},
{2, 1});
});
// Copy the scratchpad into the output with axes 1 and 2 swapped.
if (m_transpose_frames) {
const auto stft_transp_out_shape = VectorDims{batch_size, fft_out_shape[0], num_frames, fft_out_shape[1]};
transpose_out4d(reinterpret_cast<const uint8_t*>(dst),
reinterpret_cast<uint8_t*>(rdft_result),
stft_shape,
stft_transp_out_shape,
sizeof(float));
}
}

void STFT::executeDynamicImpl(dnnl::stream strm) {
Expand All @@ -92,6 +176,20 @@ bool STFT::needShapeInfer() const {
return !(m_is_frame_size_const && m_is_frame_step_const) || Node::needShapeInfer();
}

void STFT::createPrimitive() {
RDFTKey key{};
key.isInverse = false;
auto buildExecutor = [&](const RDFTKey& key) -> std::shared_ptr<RDFTExecutor> {
return RDFTExecutor::build(key.isInverse, getSelectedPrimitiveDescriptor());
};

auto cache = context->getParamsCache();
auto result = cache->getOrCreate(key, buildExecutor);
rdft_executor = result.first;

Node::createPrimitive();
}

} // namespace node
} // namespace intel_cpu
} // namespace ov
4 changes: 4 additions & 0 deletions src/plugins/intel_cpu/src/nodes/stft.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <string>

#include "node.h"
#include "rdft.h"

namespace ov {
namespace intel_cpu {
Expand All @@ -21,6 +22,7 @@ class STFT : public Node {
bool created() const override;
static bool isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept;
bool needPrepareParams() const override;
void createPrimitive() override;

void execute(dnnl::stream strm) override;
void executeDynamicImpl(dnnl::stream strm) override;
Expand All @@ -35,6 +37,8 @@ class STFT : public Node {
/// STFT params
bool m_transpose_frames = false;

// RDFT executor
std::shared_ptr<RDFTExecutor> rdft_executor = nullptr;
bool m_is_frame_size_const = false;
bool m_is_frame_step_const = false;

Expand Down

0 comments on commit 13d60b1

Please sign in to comment.