Skip to content

Commit

Permalink
dnn/transpose: Implement optimized fp64 kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
fischeti committed Nov 7, 2023
1 parent f64fe60 commit f153dbb
Showing 1 changed file with 36 additions and 2 deletions.
38 changes: 36 additions & 2 deletions sw/dnn/transpose/src/transpose.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,40 @@ static inline void transposed2d_fp64(double* input, double* output, uint32_t M,
}
}

/**
 * @brief Implementation of an optimized FP64 Transpose2D kernel
 *
 * Streams the input through SSR data mover 0 (read) and the output through
 * SSR data mover 1 (write), copying one element per repetition of a
 * single-instruction FREP loop. The element read from input[m * N + n] is
 * written to output[n * M_stride + m].
 *
 * @param input Pointer to input feature map (M rows of N contiguous doubles)
 * @param output Pointer to output feature map
 * @param M First dimension of the matrix (number of input rows to process)
 * @param N Second dimension of the matrix (number of input columns)
 * @param M_stride Distance, in elements, between consecutive rows of the
 *                 output matrix; the caller in this file passes the full
 *                 (un-tiled) first dimension here
 */
static inline void transposed2d_fp64_opt(double* input, double* output,
                                         uint32_t M, uint32_t N,
                                         uint32_t M_stride) {
    // Nothing to transpose. Returning early also keeps the unsigned
    // expression M * N - 1 below from wrapping around to 0xFFFFFFFF and
    // being handed to FREP as the repetition count.
    if (M == 0 || N == 0) {
        return;
    }

    // Both streams iterate N elements in the inner loop and M in the outer.
    const uint32_t ssr_b[2] = {N, M};
    // Read stream: walk the input contiguously, row by row.
    const uint32_t ssr0_i[2] = {sizeof(double), N * sizeof(double)};
    // Write stream: the inner loop jumps by one output row (M_stride
    // elements), the outer loop advances by one element.
    const uint32_t ssr1_i[2] = {sizeof(double) * M_stride, sizeof(double)};

    snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr_b[0], ssr_b[1], ssr0_i[0], ssr0_i[1]);
    snrt_ssr_loop_2d(SNRT_SSR_DM1, ssr_b[0], ssr_b[1], ssr1_i[0], ssr1_i[1]);

    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_2D, input);
    snrt_ssr_write(SNRT_SSR_DM1, SNRT_SSR_2D, output);
    snrt_ssr_enable();

    // fsgnj.d rd, rs, rs is a plain FP64 register move (ft0 -> ft1); FREP
    // repeats it with a repetition-count operand of M * N - 1.
    // The "memory" clobber tells the compiler that memory is accessed behind
    // its back (through the streams), so it must not cache or reorder
    // accesses to input/output across this statement.
    asm volatile(
        "frep.o %[n_frep], 1, 0, 0 \n"
        "fsgnj.d ft1, ft0, ft0 \n"
        :
        : [n_frep] "r"(M * N - 1)
        : "ft0", "ft1", "ft2", "memory");

    snrt_ssr_disable();

    snrt_fpu_fence();
}

/**
* @brief Implementation of the FP32 Transpose2D kernel
*
Expand Down Expand Up @@ -106,8 +140,8 @@ static inline void transpose2d_layer(transpose2d_layer_t const l) {
l.M / compute_num, l.N, l.M);
break;
case FP64:
transposed2d_fp64(input_offset, output_offset,
l.M / compute_num, l.N, l.M);
transposed2d_fp64_opt(input_offset, output_offset,
l.M / compute_num, l.N, l.M);
break;
default:
break;
Expand Down

0 comments on commit f153dbb

Please sign in to comment.