[STFT][CPU] Improve performance of STFT for CPU by reusing the RDFT JIT Executor (openvinotoolkit#26967)

### Details:
- Improve the performance of STFT for the CPU plugin by reusing the RDFT JIT executor.
- Use parallel loops in STFT.
- No changes to the logic of the existing RDFT executor; the `RDFTExecutor::build` function has been added so that the RDFT details stay hidden in the .cpp file as before.
- Performance numbers were collected within the ticket.

### Tickets:
- 156115

---------

Co-authored-by: Michal Lukaszewski <[email protected]>
praasz · Dec 19, 2024 · 13d60b1 · 13d60b1
1 parent e8f906e
commit 13d60b1
Show file tree

Hide file tree

Showing 4 changed files with 149 additions and 27 deletions.
diff --git a/src/plugins/intel_cpu/src/nodes/rdft.cpp b/src/plugins/intel_cpu/src/nodes/rdft.cpp
@@ -838,17 +838,20 @@ struct RDFTJitExecutor : public RDFTExecutor {
             rdftKernel.reset(new jit_dft_kernel_f32<cpu::x64::avx512_core>(isInverse, rdftType));
             dftKernel.reset(new jit_dft_kernel_f32<cpu::x64::avx512_core>(isInverse, complex_to_complex));
             vlen = cpu_isa_traits<cpu::x64::avx512_core>::vlen;
-            primDesc->setImplementationType(jit_avx512);
+            if (primDesc)
+                primDesc->setImplementationType(jit_avx512);
         } else if (mayiuse(cpu::x64::avx2)) {
             rdftKernel.reset(new jit_dft_kernel_f32<cpu::x64::avx2>(isInverse, rdftType));
             dftKernel.reset(new jit_dft_kernel_f32<cpu::x64::avx2>(isInverse, complex_to_complex));
             vlen = cpu_isa_traits<cpu::x64::avx2>::vlen;
-            primDesc->setImplementationType(jit_avx2);
+            if (primDesc)
+                primDesc->setImplementationType(jit_avx2);
         } else if (mayiuse(cpu::x64::sse41)) {
             rdftKernel.reset(new jit_dft_kernel_f32<cpu::x64::sse41>(isInverse, rdftType));
             dftKernel.reset(new jit_dft_kernel_f32<cpu::x64::sse41>(isInverse, complex_to_complex));
             vlen = cpu_isa_traits<cpu::x64::sse41>::vlen;
-            primDesc->setImplementationType(jit_sse42);
+            if (primDesc)
+                primDesc->setImplementationType(jit_sse42);
         } else {
             OPENVINO_THROW("Can't create RDFT kernel");
         }
@@ -1075,22 +1078,6 @@ struct RDFTRefExecutor : public RDFTExecutor {
     }
 };
 
-struct RDFTKey {
-    bool isInverse;
-
-    size_t hash() const {
-        using namespace dnnl::impl::primitive_hashing;
-
-        size_t seed = 0;
-        seed = hash_combine(seed, isInverse);
-        return seed;
-    }
-
-    bool operator==(const RDFTKey& rhs) const {
-        return isInverse == rhs.isInverse;
-    }
-};
-
 void RDFT::createPrimitive() {
     RDFTKey key{};
     key.isInverse = inverse;
@@ -1115,6 +1102,22 @@ void RDFT::createPrimitive() {
 
     Node::createPrimitive();
 }
+
+/**
+ * @brief Creates an RDFT executor for the current build and CPU.
+ *
+ * On x86-64 builds an RDFTJitExecutor is returned when mayiuse() reports at least SSE4.1;
+ * in every other case an RDFTRefExecutor is returned.
+ *
+ * @param inverse  Forwarded to the executor constructor.
+ * @param primDesc Optional primitive descriptor that receives the implementation type.
+ *                 May be nullptr (the default in the declaration); nothing is recorded then.
+ * @return The created executor.
+ */
+std::shared_ptr<RDFTExecutor> RDFTExecutor::build(bool inverse, NodeDesc* primDesc) {
+#if defined(OPENVINO_ARCH_X86_64)
+    using namespace dnnl::impl;
+    using namespace dnnl::impl::cpu::x64;
+    if (mayiuse(cpu::x64::sse41)) {
+        return std::make_shared<RDFTJitExecutor>(inverse, primDesc);
+    }
+#endif
+    std::shared_ptr<RDFTExecutor> executor = std::make_shared<RDFTRefExecutor>(inverse);
+    // primDesc is optional: guard the write the same way the JIT executor does,
+    // otherwise build(inverse) with the default argument dereferences a null pointer here.
+    if (primDesc) {
+        primDesc->setImplementationType(ref_any);
+    }
+    return executor;
+}
+
 }  // namespace node
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/rdft.h b/src/plugins/intel_cpu/src/nodes/rdft.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "common/primitive_hashing_utils.hpp"
 #include "kernels/x64/rdft_kernel.hpp"
 #include "node.h"
 
@@ -30,6 +31,8 @@ struct RDFTExecutor {
                                                      const std::vector<size_t>& outputShape,
                                                      const std::vector<int>& axes);
 
+    static std::shared_ptr<RDFTExecutor> build(bool inverse, NodeDesc* primDesc = nullptr);
+
 protected:
     bool isInverse;
 
@@ -125,6 +128,20 @@ class RDFT : public Node {
     bool isSignalSizesConstant = false;
 };
 
+/// Key used to look up RDFT executors in the params cache.
+/// The only distinguishing property is the isInverse flag.
+struct RDFTKey {
+    bool isInverse;
+
+    /// Hash value derived solely from isInverse (combined into a zero seed).
+    size_t hash() const {
+        return dnnl::impl::hash_combine(size_t{0}, isInverse);
+    }
+
+    /// Two keys are equal when their isInverse flags are equal.
+    bool operator==(const RDFTKey& rhs) const {
+        return rhs.isInverse == isInverse;
+    }
+};
+
 }  // namespace node
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/stft.cpp b/src/plugins/intel_cpu/src/nodes/stft.cpp
@@ -4,6 +4,10 @@
 
 #include "stft.h"
 
+#include "cpu/x64/cpu_isa_traits.hpp"
+#include "cpu/x64/jit_generator.hpp"
+#include "nodes/common/cpu_memcpy.h"
+#include "openvino/core/parallel.hpp"
 #include "openvino/core/type.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/op/stft.hpp"
@@ -73,15 +77,95 @@ bool STFT::created() const {
     return getType() == Type::STFT;
 }
 
+namespace {
+/// Copies a 4D buffer into `out` with its two middle axes swapped (axes order {0, 2, 1, 3}).
+/// The innermost axis is not permuted, so each run of out_shape[3] elements
+/// (out_shape[3] * elem_size bytes) is moved with a single cpu_memcpy call.
+///
+/// @param in         source bytes laid out according to in_shape
+/// @param out        destination bytes laid out according to out_shape
+/// @param in_shape   4D shape of the source
+/// @param out_shape  4D shape of the destination
+/// @param elem_size  size of one element in bytes
+static void transpose_out4d(const uint8_t* in,
+                            uint8_t* out,
+                            const VectorDims& in_shape,
+                            const VectorDims& out_shape,
+                            size_t elem_size) {
+    const size_t row_bytes = out_shape[3] * elem_size;
+    parallel_for3d(out_shape[0],
+                   out_shape[1],
+                   out_shape[2],
+                   [in, out, &in_shape, &out_shape, elem_size, row_bytes](size_t b, size_t r, size_t c) {
+                       // Output position (b, r, c) is read from input position (b, c, r).
+                       const size_t src_row = ((b * in_shape[1] + c) * in_shape[2] + r) * in_shape[3];
+                       const size_t dst_row = ((b * out_shape[1] + r) * out_shape[2] + c) * out_shape[3];
+                       cpu_memcpy(out + dst_row * elem_size, in + src_row * elem_size, row_bytes);
+                   });
+}
+}  // namespace
+
 void STFT::execute(dnnl::stream strm) {
-    ov::reference::stft(getSrcDataAtPortAs<const float>(DATA_IDX),
-                        getSrcDataAtPortAs<const float>(WINDOW_IDX),
-                        getDstDataAtPortAs<float>(0),
-                        ov::Shape{getSrcMemoryAtPort(DATA_IDX)->getStaticDims()},
-                        ov::Shape{getSrcMemoryAtPort(WINDOW_IDX)->getStaticDims()},
-                        (getSrcDataAtPortAs<const int32_t>(FRAME_SIZE_IDX))[0],
-                        (getSrcDataAtPortAs<const int32_t>(FRAME_STEP_IDX))[0],
-                        m_transpose_frames);
+    const float* signal = getSrcDataAtPortAs<const float>(DATA_IDX);
+    const float* window = getSrcDataAtPortAs<const float>(WINDOW_IDX);
+    float* rdft_result = getDstDataAtPortAs<float>(0);
+    const VectorDims& signal_shape = getSrcMemoryAtPort(DATA_IDX)->getStaticDims();
+    const VectorDims& window_shape = getSrcMemoryAtPort(WINDOW_IDX)->getStaticDims();
+    const int64_t frame_size = (getSrcDataAtPortAs<const int32_t>(FRAME_SIZE_IDX))[0];
+    const int64_t frame_step = (getSrcDataAtPortAs<const int32_t>(FRAME_STEP_IDX))[0];
+
+    const auto is_signal_1D = signal_shape.size() == 1;
+    const size_t batch_size = is_signal_1D ? 1 : signal_shape[0];
+    const size_t signal_axis = is_signal_1D ? 0 : 1;
+    const auto signal_length = signal_shape[signal_axis];
+    const auto num_frames = static_cast<size_t>((signal_length - frame_size) / frame_step) + 1;
+    const auto frame_size_dim = static_cast<size_t>(frame_size);
+    const auto fft_out_shape = VectorDims{static_cast<size_t>((frame_size_dim / 2) + 1), 2};
+    const auto fft_out_shape_size = shape_size(fft_out_shape);
+
+    const auto window_length = window_shape[0] < frame_size_dim ? window_shape[0] : frame_size_dim;
+    std::vector<float> pad_window(frame_size, 0);
+    cpu_parallel_memcpy(pad_window.data() + (frame_size_dim - window_length) / 2,
+                        window,
+                        sizeof(float) * window_shape[0]);
+
+    float* dst = rdft_result;
+    const auto stft_shape = VectorDims{batch_size, num_frames, fft_out_shape[0], fft_out_shape[1]};
+    if (m_transpose_frames) {  // Store intermediate results
+        MemoryPtr dst_mem =
+            getScratchPadMem(std::make_shared<CpuBlockedMemoryDesc>(ov::element::f32, Shape{stft_shape}));
+        dst = dst_mem->getDataAs<float>();
+    }
+
+    parallel_for2d(batch_size, num_frames, [&](size_t batch, size_t frame_idx) {
+        size_t batch_in_start = batch * signal_length;
+        size_t batch_frames_out = batch * num_frames;
+
+        const auto frame_start = batch_in_start + frame_idx * frame_step;
+        const auto frame_end = frame_start + frame_size;
+        std::vector<float> signal_slice(signal + frame_start, signal + frame_end);
+        std::transform(signal_slice.begin(),
+                       signal_slice.end(),
+                       pad_window.begin(),
+                       signal_slice.begin(),
+                       std::multiplies<float>());
+
+        const auto result_idx = (batch_frames_out + frame_idx) * fft_out_shape_size;
+        auto twiddles = rdft_executor->generateTwiddles({static_cast<int>(signal_slice.size())}, fft_out_shape, {0});
+        rdft_executor->execute(signal_slice.data(),
+                               dst + result_idx,
+                               twiddles,
+                               1,
+                               {0},
+                               {static_cast<int>(frame_size)},
+                               {frame_size_dim},
+                               fft_out_shape,
+                               {1},
+                               {2, 1});
+    });
+    if (m_transpose_frames) {
+        const auto stft_transp_out_shape = VectorDims{batch_size, fft_out_shape[0], num_frames, fft_out_shape[1]};
+        transpose_out4d(reinterpret_cast<const uint8_t*>(dst),
+                        reinterpret_cast<uint8_t*>(rdft_result),
+                        stft_shape,
+                        stft_transp_out_shape,
+                        sizeof(float));
+    }
 }
 
 void STFT::executeDynamicImpl(dnnl::stream strm) {
@@ -92,6 +176,20 @@ bool STFT::needShapeInfer() const {
     return !(m_is_frame_size_const && m_is_frame_step_const) || Node::needShapeInfer();
 }
 
+/// Fetches the RDFT executor for the non-inverse key from the params cache via getOrCreate,
+/// supplying a builder based on RDFTExecutor::build, stores it in rdft_executor and then
+/// calls the base Node::createPrimitive().
+void STFT::createPrimitive() {
+    RDFTKey executor_key{};
+    executor_key.isInverse = false;
+
+    // Builder handed to getOrCreate; it receives the lookup key as its argument.
+    auto make_executor = [this](const RDFTKey& k) -> std::shared_ptr<RDFTExecutor> {
+        return RDFTExecutor::build(k.isInverse, getSelectedPrimitiveDescriptor());
+    };
+
+    auto params_cache = context->getParamsCache();
+    rdft_executor = params_cache->getOrCreate(executor_key, make_executor).first;
+
+    Node::createPrimitive();
+}
+
 }  // namespace node
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/stft.h b/src/plugins/intel_cpu/src/nodes/stft.h
@@ -7,6 +7,7 @@
 #include <string>
 
 #include "node.h"
+#include "rdft.h"
 
 namespace ov {
 namespace intel_cpu {
@@ -21,6 +22,7 @@ class STFT : public Node {
     bool created() const override;
     static bool isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept;
     bool needPrepareParams() const override;
+    void createPrimitive() override;
 
     void execute(dnnl::stream strm) override;
     void executeDynamicImpl(dnnl::stream strm) override;
@@ -35,6 +37,8 @@ class STFT : public Node {
     /// STFT params
     bool m_transpose_frames = false;
 
+    // RDFT executor
+    std::shared_ptr<RDFTExecutor> rdft_executor = nullptr;
     bool m_is_frame_size_const = false;
     bool m_is_frame_step_const = false;