From f153dbb404136530c47d6bb095753322c2660610 Mon Sep 17 00:00:00 2001
From: Tim Fischer
Date: Tue, 7 Nov 2023 17:16:55 +0100
Subject: [PATCH] dnn/transpose: Implement optimized fp64 kernel

---
 sw/dnn/transpose/src/transpose.h | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/sw/dnn/transpose/src/transpose.h b/sw/dnn/transpose/src/transpose.h
index a2ebacc56..36ed0cc5b 100644
--- a/sw/dnn/transpose/src/transpose.h
+++ b/sw/dnn/transpose/src/transpose.h
@@ -45,6 +45,40 @@ static inline void transposed2d_fp64(double* input, double* output, uint32_t M,
     }
 }
 
+/**
+ * @brief Implementation of an optimized FP64 Transpose2D kernel
+ *
+ * @param input Pointer to input feature map
+ * @param output Pointer to output feature map
+ * @param M First dimension of the matrix
+ * @param N Second dimension of the matrix
+ * @param M_stride Stride, in elements, of the output stream per step along N
+ */
+static inline void transposed2d_fp64_opt(double* input, double* output,
+                                         uint32_t M, uint32_t N,
+                                         uint32_t M_stride) {
+    if (M == 0 || N == 0) return;  // avoids M * N - 1 wrapping to UINT32_MAX
+
+    const uint32_t ssr_b[2] = {N, M};
+    const uint32_t ssr0_i[2] = {sizeof(double), N * sizeof(double)};
+    const uint32_t ssr1_i[2] = {sizeof(double) * M_stride, sizeof(double)};
+
+    snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr_b[0], ssr_b[1], ssr0_i[0], ssr0_i[1]);
+    snrt_ssr_loop_2d(SNRT_SSR_DM1, ssr_b[0], ssr_b[1], ssr1_i[0], ssr1_i[1]);
+
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_2D, input);
+    snrt_ssr_write(SNRT_SSR_DM1, SNRT_SSR_2D, output);
+    snrt_ssr_enable();
+
+    // Repeated move; "memory" marks the output buffer as written by the asm
+    asm volatile(
+        "frep.o %[n_frep], 1, 0, 0 \n"
+        "fsgnj.d ft1, ft0, ft0 \n" ::[ n_frep ] "r"(M * N - 1)
+        : "ft0", "ft1", "ft2", "memory");
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+}
+
 /**
  * @brief Implementation of the FP32 Transpose2D kernel
  *
@@ -106,8 +140,8 @@ static inline void transpose2d_layer(transpose2d_layer_t const l) {
                               l.M / compute_num, l.N, l.M);
             break;
         case FP64:
-            transposed2d_fp64(input_offset, output_offset,
-                              l.M / compute_num, l.N, l.M);
+            transposed2d_fp64_opt(input_offset, output_offset,
+                                  l.M / compute_num, l.N, l.M);
             break;
         default:
             break;