diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py
index d19e372a1..08b924dca 100755
--- a/sw/blas/gemm/data/datagen.py
+++ b/sw/blas/gemm/data/datagen.py
@@ -122,6 +122,7 @@ def emit_header(**kwargs):
     # gemmImpl
     data_str += ["// -- gemmImpl"]
     data_str += [f"#define USE_METHOD {gemmImpl['method']}"]
+    data_str += [f"#define USE_C2C_TILES {int(gemmImpl['use_c2c_tiles'])}"]
     data_str += [f"#define L1_M {gemmImpl['L1_M']}"]
     data_str += [f"#define L1_N {gemmImpl['L1_N']}"]
    data_str += [f"#define L1_K {gemmImpl['L1_K']}"]
diff --git a/sw/blas/gemm/data/params.hjson b/sw/blas/gemm/data/params.hjson
index 46b725ded..e4c33fe2c 100644
--- a/sw/blas/gemm/data/params.hjson
+++ b/sw/blas/gemm/data/params.hjson
@@ -21,11 +21,12 @@
     }
 
     gemmImpl: {
-        method: "baseline",
-        L1_M: 8,
-        L1_N: 8,
-        L1_K: 8,
-        ta_tile: false,
+        method: "2dpipe",
+        use_c2c_tiles: true,
+        L1_M: 16,
+        L1_N: 16,
+        L1_K: 16,
+        ta_tile: true,
         tb_tile: false,
         tc_tile: false, // not implemented
         expand: 0,
diff --git a/sw/blas/gemm/src/dma_xfer_test.h b/sw/blas/gemm/src/dma_xfer_test.h
index e55e0426f..acb7d6f03 100644
--- a/sw/blas/gemm/src/dma_xfer_test.h
+++ b/sw/blas/gemm/src/dma_xfer_test.h
@@ -1,5 +1,11 @@
 #include "gemm_decls.h"
 
+/**
+ * Tests effective DMA bandwidth.
+ * Transfers an array from HBM to TCDM,
+ * rotates it between cluster TCDMs using C2C communication,
+ * then stores the result back to HBM.
+ */
 void dma_xfer_test(const double* A, const uint32_t N, const bool bench) {
     if (!snrt_is_dm_core()) return;
 
diff --git a/sw/blas/gemm/src/gemm_baseline.h b/sw/blas/gemm/src/gemm_baseline.h
deleted file mode 100644
index ae3e297e7..000000000
--- a/sw/blas/gemm/src/gemm_baseline.h
+++ /dev/null
@@ -1,183 +0,0 @@
-// GEMM implementation for OCCAMY without any horizontal (C2C/G2G) communication
-
-#pragma once
-
-#include <stdbool.h>
-#include <stdint.h>
-#include <stddef.h>
-
-#include "gemm.h"
-#include "snrt.h"
-
-#include "gemm_decls.h"
-
-/**
- * \brief Each cluster performs a GEMM for A, B, C inside each TCDM
- */
-void gemm_cluster_kernel(double alpha, double beta, uint32_t M, uint32_t N,
-                         uint32_t K, double* const A, double* const B,
-                         double* const C, int lda, int ldb, int ldc) {
-    uint32_t p[3], P[3];
-    ocrt_thread_idx(p);
-    ocrt_compute_thread_num(P);
-
-    for (uint32_t i = p[0]; i < M; i += P[0]) {
-        for (uint32_t j = 0; j < N; j++) {
-            uint32_t cIdx = i * ldc + j;  // C[i][j]
-            register double c0 = beta * C[cIdx];
-
-            for (uint32_t k = 0; k < K; k++) {
-                uint32_t aIdx = i * lda + k;  // A[i][k]
-                uint32_t bIdx = k * ldb + j;  // B[k][j]
-
-                c0 += A[aIdx] * B[bIdx];
-            }
-            C[cIdx] = c0;
-        }
-    }
-    snrt_fpu_fence();
-}
-
-void gemm_oc_baseline(double alpha, double beta, uint32_t m, uint32_t n,
-                      uint32_t k, double* A, double* B, double* C, uint32_t lda,
-                      uint32_t ldb, uint32_t ldc) {
-    /**
-     * Problem is double buffered in L1. The buffer that is used is toggled at
-     * each iteration. The DMA cores are one index step ahead so they load the
-     * data in advance into the buffer that will be used.
-     */
-
-    volatile uint32_t p[3] = {0, 0, 0};
-    volatile uint32_t P[3] = {0, 0, 0};
-    ocrt_thread_idx(p);
-    ocrt_compute_thread_num(P);
-
-    // Setup layout for TCDM L1
-    // For double buffering l1 is a size 2 array
-    TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next();
-
-    bool l1Id_AB = false;
-    bool l1Id_C = false;
-
-    // Initialize indices
-    const uint32_t I = m, J = n, K = k;
-
-    const uint32_t PI = P[1], PJ = 1;
-    const uint32_t pi = p[1] / PJ;
-    const uint32_t pj = p[1] % PJ;
-
-    int ib, jb, kb;
-    int ib_prev, jb_prev, kb_prev;
-    bool ib_dir = false, jb_dir = false, kb_dir = false;
-
-    bool storeC = false;
-
-    // Debug
-    volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;
-
-    if (snrt_is_compute_core()) {
-        snrt_cluster_hw_barrier();  // DMA core is one index ahead
-    }
-
-    // FOR_EACH(ib, pi, I / L1_M, PI, ib_dir, ib_prev) {
-    ib_dir = !ib_dir;
-    const int ib_end_floor = ((I / 8 - pi + PI - 1) / PI) * PI - PI + pi;
-    const int ib_first = ib_dir ? pi : ib_end_floor;
-    const int ib_last = ib_dir ? ib_end_floor : pi;
-    ib = ib_first;
-    ib_prev = ib;
-    for (; ib_dir ? ib <= ib_last : ib >= ib_last;
-         ib = ib_dir ? ib + PI : ib - PI) {
-        ib_cnt += ib;
-        // FOR_EACH(jb, pj, J / L1_N, PJ, jb_dir, jb_prev) {
-        jb_dir = !jb_dir;
-        const int jb_end_floor = ((J / 8 - pj + PJ - 1) / PJ) * PJ - PJ + pj;
-        const int jb_first = jb_dir ? pj : jb_end_floor;
-        const int jb_last = jb_dir ? jb_end_floor : pj;
-        jb = jb_first;
-        jb_prev = jb;
-        for (; jb_dir ? jb <= jb_last : jb >= jb_last;
-             jb = jb_dir ? jb + PJ : jb - PJ) {
-            jb_cnt += jb;
-
-            double* const l1_C = l1[l1Id_C].C;
-
-            if (snrt_is_dm_core()) {
-                dump_ib(ib);
-                dump_jb(jb);
-                snrt_dma_load_2d_tile(l1_C, C, ib, jb, L1_M, L1_N, ldc, FP64);
-                if (ib != ib_first || jb != jb_first) storeC = true;
-            }
-
-            // FOR_EACH(kb, 0, K / L1_K, 1, kb_dir, kb_prev) {
-            kb_dir = !kb_dir;
-            const int kb_end_floor = ((K / L1_K - 0 + 1 - 1) / 1) * 1 - 1 + 0;
-            const int kb_first = kb_dir ? 0 : kb_end_floor;
-            const int kb_last = kb_dir ? kb_end_floor : 0;
-            kb = kb_first;
-            kb_prev = kb;
-            for (; kb_dir ? kb <= kb_last : kb >= kb_last;
-                 kb = kb_dir ? kb + 1 : kb - 1) {
-                kb_cnt += kb;
-                double* const l1_A = l1[l1Id_AB].A;
-                double* const l1_B = l1[l1Id_AB].B;
-
-                // load next A, B
-                if (snrt_is_dm_core()) {
-                    snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda,
-                                          FP64);
-                    snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb,
-                                          FP64);
-
-                    snrt_dma_wait_all();
-                } else {
-                    // solve block already in l1, parallelize inside each
-                    // cluster gemm_cluster_kernel(alpha, beta, L1_M, L1_N,
-                    // L1_K, l1_A, l1_B, l1_C, L1_LDA, L1_LDB, L1_LDC);
-
-                    gemm(FP64, 0, true, false, false, L1_M, L1_N, L1_K, alpha,
-                         l1_A, L1_LDA, l1_B, L1_LDB, beta, l1_C, L1_LDC);
-                }
-
-                l1Id_AB = !l1Id_AB;  // switch buffers
-                snrt_cluster_hw_barrier();
-
-                if (snrt_is_dm_core()) {
-                    if (storeC) {
-                        storeC = false;
-                        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev,
-                                               jb_prev, L1_M, L1_N, ldc, FP64);
-                    }
-                }
-                kb_prev = kb;
-            }
-
-            l1Id_C = !l1Id_C;  // switch buffers
-            jb_prev = jb;
-            ib_prev = ib;
-        }
-    }
-
-    if (snrt_is_dm_core()) {
-        snrt_cluster_hw_barrier();  // DMA core is one index ahead
-
-        // store final tile
-        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N,
-                               ldc, FP64);
-        snrt_dma_wait_all();
-    }
-
-    // Free memory once implemented by snrt
-    // snrt_l1free(l1);
-}
-
-inline void gemm_oc(precision_t prec, uint32_t expand, uint32_t setup_ssr,
-                    uint32_t transa, uint32_t transb, uint32_t m, uint32_t n,
-                    uint32_t k, double alpha, void* a, uint32_t lda, void* b,
-                    uint32_t ldb, uint32_t beta, void* c, uint32_t ldc) {
-    // gemm_cluster_kernel(alpha, beta, m, n, k, a, b, c, lda, ldb, ldc);
-    // snrt_fpu_fence();
-    // snrt_cluster_hw_barrier();
-
-    gemm_oc_baseline(alpha, beta, m, n, k, a, b, c, lda, ldb, ldc);
-}
diff --git a/sw/blas/gemm/src/gemm_kernel_legacy.h b/sw/blas/gemm/src/gemm_kernel_legacy.h
index 9cfd42053..1b74854b5 100644
--- a/sw/blas/gemm/src/gemm_kernel_legacy.h
+++ b/sw/blas/gemm/src/gemm_kernel_legacy.h
@@ -23,6 +23,33 @@ typedef __fp16 v4f16 __attribute__((vector_size(8)));
 typedef char v8f8 __attribute__((vector_size(8)));
 #endif
 
+/**
+ * \brief Each cluster performs a GEMM for A, B, C inside each TCDM
+ */
+void gemm_cluster_kernel_baseline(double alpha, double beta, uint32_t M, uint32_t N,
+                                  uint32_t K, double* const A, double* const B,
+                                  double* const C, int lda, int ldb, int ldc) {
+    uint32_t p[3], P[3];
+    ocrt_thread_idx(p);
+    ocrt_compute_thread_num(P);
+
+    for (uint32_t i = p[0]; i < M; i += P[0]) {
+        for (uint32_t j = 0; j < N; j++) {
+            uint32_t cIdx = i * ldc + j;  // C[i][j]
+            register double c0 = beta * C[cIdx];
+
+            for (uint32_t k = 0; k < K; k++) {
+                uint32_t aIdx = i * lda + k;  // A[i][k]
+                uint32_t bIdx = k * ldb + j;  // B[k][j]
+
+                c0 += A[aIdx] * B[bIdx];
+            }
+            C[cIdx] = c0;
+        }
+    }
+    snrt_fpu_fence();
+}
+
 void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A,
                         uint32_t ldA, uint32_t ta, double* B, uint32_t ldB,
                         uint32_t tb, double* C, uint32_t ldC, double ALPHA) {
diff --git a/sw/blas/gemm/src/gemm_tiling_2dpipe_tpl.h b/sw/blas/gemm/src/gemm_tiling_2dpipe_tpl.h
index 6b4540f20..a84a11a13 100644
--- a/sw/blas/gemm/src/gemm_tiling_2dpipe_tpl.h
+++ b/sw/blas/gemm/src/gemm_tiling_2dpipe_tpl.h
@@ -10,8 +10,6 @@
 void SNBLAS_GEMM_TILING(2dpipe, FLOAT_T, IS_DM_CORE)
 (const SnblasGemmInfo info, const SNBLAS_GEMM_ARGS(FLOAT_T) args,
  const SnblasGemmImpl impl) {
-#define USE_C2C_TILES true
-
     /**
     * Problem is double buffered in L1. The buffer that is used is toggled at
     * each iteration. The DMA cores are one index step ahead so they load the
diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c
index 018726a86..93f6584ba 100644
--- a/sw/blas/gemm/src/main.c
+++ b/sw/blas/gemm/src/main.c
@@ -21,8 +21,6 @@
 NAMED_DUMP(uint32_t, err, 0x7)
 NAMED_DUMP(uint32_t, bench_iter, 0x7)
 
-#define BIST
-#include "data.h"
 
 int main() {
     const bool setup_ssr = true;
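
Note on the double-buffering scheme that both the removed gemm_oc_baseline and the 2dpipe tiling describe in their comments: compute cores work on one L1 buffer while the DMA core, running one index step ahead, fills the other, and the two sides swap buffers every iteration. The stand-alone C sketch below mocks that ping-pong on a host machine. Tile, fetch_tile, and compute_tile are invented stand-ins for illustration (plain memcpy takes the place of snrt_dma_load_2d_tile and the snrt_cluster_hw_barrier synchronization), so it shows only the index and toggle logic, not Occamy code.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define N_TILES 8
#define TILE_ELEMS 64

typedef struct { double data[TILE_ELEMS]; } Tile;

static Tile hbm[N_TILES]; /* stand-in for tiles in main memory */
static Tile l1[2];        /* the two "TCDM" tile slots */

/* Stand-in for snrt_dma_load_2d_tile(): copy tile t into L1 slot buf. */
static void fetch_tile(bool buf, int t) {
    memcpy(&l1[buf], &hbm[t], sizeof(Tile));
}

/* Stand-in for the cluster GEMM kernel: consume the tile in slot buf. */
static double compute_tile(bool buf) {
    double acc = 0.0;
    for (int i = 0; i < TILE_ELEMS; i++) acc += l1[buf].data[i];
    return acc;
}

int main(void) {
    for (int t = 0; t < N_TILES; t++)
        for (int i = 0; i < TILE_ELEMS; i++) hbm[t].data[i] = (double)t;

    bool l1Id = false;   /* buffer the compute side uses this step */
    fetch_tile(l1Id, 0); /* "DMA" starts one index ahead: preload tile 0 */

    double total = 0.0;
    for (int t = 0; t < N_TILES; t++) {
        /* Prefetch the next tile into the *other* buffer while the
         * current one is consumed; on Occamy these two halves run on
         * different cores, separated by snrt_cluster_hw_barrier(). */
        if (t + 1 < N_TILES) fetch_tile(!l1Id, t + 1);
        total += compute_tile(l1Id);
        l1Id = !l1Id; /* toggle buffers, as in l1Id_AB = !l1Id_AB */
    }

    printf("sum over all tiles: %f\n", total); /* 64 * (0+1+...+7) = 1792 */
    return 0;
}

Run sequentially this computes the same result either way; on the real hardware the buffer toggle plus the one-step-ahead ordering is what lets the DMA transfer for iteration t+1 overlap the FPU work of iteration t.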