From 220a908bba292790ee3831437f9e72b54687703a Mon Sep 17 00:00:00 2001 From: Roger Barton Date: Mon, 15 Jan 2024 16:56:23 +0100 Subject: [PATCH] clang-format --- sw/blas/gemm/src/gemm_occamy_1dpipe.h | 96 +++++++++-------- sw/blas/gemm/src/gemm_occamy_2dpipe.h | 107 ++++++++++--------- sw/blas/gemm/src/gemm_occamy_baseline.h | 136 +++++++++++++----------- sw/blas/gemm/src/main.c | 12 +-- 4 files changed, 184 insertions(+), 167 deletions(-) diff --git a/sw/blas/gemm/src/gemm_occamy_1dpipe.h b/sw/blas/gemm/src/gemm_occamy_1dpipe.h index 7becebbd7..56f63cd9d 100644 --- a/sw/blas/gemm/src/gemm_occamy_1dpipe.h +++ b/sw/blas/gemm/src/gemm_occamy_1dpipe.h @@ -2,12 +2,12 @@ #pragma once -#include #include +#include #include -#include "snrt.h" #include "gemm.h" +#include "snrt.h" #include "dump.h" NAMED_DUMP(uint32_t, aIdx, 0x1a) @@ -20,29 +20,32 @@ NAMED_DUMP(double, a, 0xa) NAMED_DUMP(double, b, 0xb) NAMED_DUMP(double, c, 0xc) -#define MIN(a,b) ((a)<(b)?(a):(b)) -#define MAX(a,b) ((a)>(b)?(a):(b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) /** * \brief Implements a reversing loop for an index range * \param begin Beginning of the range * \param end End of the range * \param dir Sets the direction of traversal. True: loop starts at begin. - * \param i_prev Set the previous index to the first index, must update this manually at the end of the loop. - * \details i_end_floor will contain the exact end with the stride, s.t. the reversed loop starts at the correct index. + * \param i_prev Set the previous index to the first index, must update this + * manually at the end of the loop. \details i_end_floor will contain the exact + * end with the stride, s.t. the reversed loop starts at the correct index. */ -#define FOR_EACH(i, begin, end, stride, dir, i_prev) \ - dir = !dir; \ - const int i##_end_floor = ((end - begin + stride - 1) / stride) * stride - stride + begin; \ - const int i##_first = dir ? begin : i##_end_floor; \ - const int i##_last = dir ? i##_end_floor : begin; \ - i = i##_first; \ - i_prev = i; \ - for (; dir ? i <= i##_last : i >= i##_last; i = dir ? i + stride : i - stride) - -#define L1_M 8 //128; -#define L1_N 8 //128; -#define L1_K 8 //128; +#define FOR_EACH(i, begin, end, stride, dir, i_prev) \ + dir = !dir; \ + const int i##_end_floor = \ + ((end - begin + stride - 1) / stride) * stride - stride + begin; \ + const int i##_first = dir ? begin : i##_end_floor; \ + const int i##_last = dir ? i##_end_floor : begin; \ + i = i##_first; \ + i_prev = i; \ + for (; dir ? i <= i##_last : i >= i##_last; \ + i = dir ? i + stride : i - stride) + +#define L1_M 8 // 128; +#define L1_N 8 // 128; +#define L1_K 8 // 128; #define L1_LDA L1_K #define L1_LDB L1_N #define L1_LDC L1_N @@ -60,14 +63,14 @@ NAMED_DUMP(TcdmLayout*, l1, 0x8) TcdmLayout* l1AddrGlobal[SNRT_CLUSTER_NUM] = {0}; -void gemm_oc_opt1d(double alpha, double beta, - uint32_t m, uint32_t n, uint32_t k, - double* A, double* B, double* C, - uint32_t lda, uint32_t ldb, uint32_t ldc) { +void gemm_oc_opt1d(double alpha, double beta, uint32_t m, uint32_t n, + uint32_t k, double* A, double* B, double* C, uint32_t lda, + uint32_t ldb, uint32_t ldc) { /** - * Problem is double buffered in L1. The buffer that is used is toggled at each iteration. - * The DMA cores are one index step ahead so they load the data in advance into the buffer that will be used. - */ + * Problem is double buffered in L1. The buffer that is used is toggled at + * each iteration. The DMA cores are one index step ahead so they load the + * data in advance into the buffer that will be used. + */ volatile uint32_t p[3] = {0, 0, 0}; volatile uint32_t P[3] = {0, 0, 0}; @@ -76,12 +79,11 @@ void gemm_oc_opt1d(double alpha, double beta, // Setup layout for TCDM L1 // For double buffering l1 is a size 2 array - TcdmLayout* l1 = (TcdmLayout*) snrt_l1_next(); + TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next(); TcdmLayout* l1Addr[SNRT_CLUSTER_NUM] = {0}; // Sync l1 pointers between clusters - if (snrt_is_dm_core()) - l1AddrGlobal[p[1]] = l1; + if (snrt_is_dm_core()) l1AddrGlobal[p[1]] = l1; snrt_global_barrier(); if (snrt_is_dm_core()) { memcpy(l1Addr, l1AddrGlobal, SNRT_CLUSTER_NUM * sizeof(*l1Addr)); @@ -91,7 +93,7 @@ void gemm_oc_opt1d(double alpha, double beta, } bool l1Id_AB = false; - bool l1Id_C = false; + bool l1Id_C = false; // Initialize indices const uint32_t I = m, J = n, K = k; @@ -110,12 +112,12 @@ void gemm_oc_opt1d(double alpha, double beta, volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0; if (snrt_is_compute_core()) { - snrt_global_barrier(); // DMA core is one index ahead + snrt_global_barrier(); // DMA core is one index ahead } // -- Compute C2C sources for 2D pipeline - volatile const uint32_t pk = MAX(pi, pj); // pipeline step - int PK = MAX(PI, PJ); // pipeline depth + volatile const uint32_t pk = MAX(pi, pj); // pipeline step + int PK = MAX(PI, PJ); // pipeline depth // Determine C2C source cluster index for each matrix, < 0 is from DRAM int pIdx_A = pj - 1; @@ -123,7 +125,6 @@ void gemm_oc_opt1d(double alpha, double beta, TcdmLayout* const p_srcA = pIdx_A == 0 ? NULL : l1Addr[pIdx_A - 1]; TcdmLayout* const p_srcB = pIdx_B == 0 ? NULL : l1Addr[pIdx_B - 1]; - // Wait for pipeline to be filled for (int pipeline = pk; pipeline > 0; --pipeline) { snrt_global_barrier(); @@ -140,8 +141,7 @@ void gemm_oc_opt1d(double alpha, double beta, dump_ib(ib); dump_jb(jb); snrt_dma_load_2d_tile(l1_C, C, ib, jb, L1_M, L1_N, ldc, FP64); - if (ib != ib_first || jb != jb_first) - storeC = true; + if (ib != ib_first || jb != jb_first) storeC = true; } FOR_EACH(kb, 0, K / L1_K, 1, kb_dir, kb_prev) { @@ -153,13 +153,15 @@ void gemm_oc_opt1d(double alpha, double beta, if (snrt_is_dm_core()) { // TODO: use multicast instead if (p_srcA == NULL) - snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, FP64); + snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, + FP64); else { double* const c2c_A = p_srcA[l1Id_AB].A; snrt_dma_start_1d(l1_A, c2c_A, L1_M * L1_K * FP64); } if (p_srcB == NULL) - snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, FP64); + snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, + FP64); else { double* const c2c_B = p_srcB[l1Id_AB].B; snrt_dma_start_1d(l1_B, c2c_B, L1_K * L1_N * FP64); @@ -167,40 +169,42 @@ void gemm_oc_opt1d(double alpha, double beta, snrt_dma_wait_all(); } else { - // solve block already in l1, parallelize inside each cluster - gemm(FP64, 0, true, false, false, - L1_M, L1_N, L1_K, alpha, + // solve block already in l1, parallelize inside each + // cluster + gemm(FP64, 0, true, false, false, L1_M, L1_N, L1_K, alpha, l1_A, L1_LDA, l1_B, L1_LDB, beta, l1_C, L1_LDC); } - l1Id_AB = !l1Id_AB; // switch buffers + l1Id_AB = !l1Id_AB; // switch buffers snrt_global_barrier(); if (snrt_is_dm_core()) { if (storeC) { storeC = false; - snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64); + snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, + jb_prev, L1_M, L1_N, ldc, FP64); } } kb_prev = kb; } - l1Id_C = !l1Id_C; // switch buffers + l1Id_C = !l1Id_C; // switch buffers jb_prev = jb; ib_prev = ib; } } if (snrt_is_dm_core()) { - snrt_global_barrier(); // DMA core is one index ahead + snrt_global_barrier(); // DMA core is one index ahead // store final tile - snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64); + snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, + ldc, FP64); snrt_dma_wait_all(); } // Wait for pipeline to be emptied - for (int pipeline = pk; pipeline < PK -1; ++pipeline) { + for (int pipeline = pk; pipeline < PK - 1; ++pipeline) { snrt_global_barrier(); } } diff --git a/sw/blas/gemm/src/gemm_occamy_2dpipe.h b/sw/blas/gemm/src/gemm_occamy_2dpipe.h index a65a4f8e1..054919644 100644 --- a/sw/blas/gemm/src/gemm_occamy_2dpipe.h +++ b/sw/blas/gemm/src/gemm_occamy_2dpipe.h @@ -2,12 +2,12 @@ #pragma once -#include #include +#include #include -#include "snrt.h" #include "gemm.h" +#include "snrt.h" #include "dump.h" NAMED_DUMP(uint32_t, aIdx, 0x1a) @@ -27,21 +27,24 @@ NAMED_DUMP(double, c, 0xc) * \param begin Beginning of the range * \param end End of the range * \param dir Sets the direction of traversal. True: loop starts at begin. - * \param i_prev Set the previous index to the first index, must update this manually at the end of the loop. - * \details i_end_floor will contain the exact end with the stride, s.t. the reversed loop starts at the correct index. + * \param i_prev Set the previous index to the first index, must update this + * manually at the end of the loop. \details i_end_floor will contain the exact + * end with the stride, s.t. the reversed loop starts at the correct index. */ -#define FOR_EACH(i, begin, end, stride, dir, i_prev) \ - dir = !dir; \ - const int i##_end_floor = ((end - begin + stride - 1) / stride) * stride - stride + begin; \ - const int i##_first = dir ? begin : i##_end_floor; \ - const int i##_last = dir ? i##_end_floor : begin; \ - i = i##_first; \ - i_prev = i; \ - for (; dir ? i <= i##_last : i >= i##_last; i = dir ? i + stride : i - stride) - -#define L1_M 8 //128; -#define L1_N 8 //128; -#define L1_K 8 //128; +#define FOR_EACH(i, begin, end, stride, dir, i_prev) \ + dir = !dir; \ + const int i##_end_floor = \ + ((end - begin + stride - 1) / stride) * stride - stride + begin; \ + const int i##_first = dir ? begin : i##_end_floor; \ + const int i##_last = dir ? i##_end_floor : begin; \ + i = i##_first; \ + i_prev = i; \ + for (; dir ? i <= i##_last : i >= i##_last; \ + i = dir ? i + stride : i - stride) + +#define L1_M 8 // 128; +#define L1_N 8 // 128; +#define L1_K 8 // 128; #define L1_LDA L1_K #define L1_LDB L1_N #define L1_LDC L1_N @@ -59,28 +62,25 @@ NAMED_DUMP(TcdmLayout*, l1, 0x8) TcdmLayout* l1AddrGlobal[SNRT_CLUSTER_NUM] = {0}; - - /** * \brief Each cluster performs a GEMM for A, B, C inside each TCDM */ -void gemm_cluster_kernel(double alpha, double beta, - uint32_t M, uint32_t N, uint32_t K, - double* const A, double* const B, double* const C, - int lda, int ldb, int ldc) { +void gemm_cluster_kernel(double alpha, double beta, uint32_t M, uint32_t N, + uint32_t K, double* const A, double* const B, + double* const C, int lda, int ldb, int ldc) { uint32_t p[3], P[3]; ocrt_thread_idx(p); ocrt_compute_thread_num(P); for (uint32_t i = p[0]; i < M; i += P[0]) { for (uint32_t j = 0; j < N; j++) { - uint32_t cIdx = i * ldc + j; // C[i][j] + uint32_t cIdx = i * ldc + j; // C[i][j] // dump_cIdx(cIdx); // dump_c(C[cIdx]); register double c0 = beta * C[cIdx]; for (uint32_t k = 0; k < K; k++) { - uint32_t aIdx = i * lda + k; // A[i][k] - uint32_t bIdx = k * ldb + j; // B[k][j] + uint32_t aIdx = i * lda + k; // A[i][k] + uint32_t bIdx = k * ldb + j; // B[k][j] // dump_aIdx(aIdx); // dump_bIdx(bIdx); // dump_a(A[aIdx]); @@ -94,14 +94,14 @@ void gemm_cluster_kernel(double alpha, double beta, snrt_fpu_fence(); } -void gemm_oc_opt2d(double alpha, double beta, - uint32_t m, uint32_t n, uint32_t k, - double* A, double* B, double* C, - uint32_t lda, uint32_t ldb, uint32_t ldc) { +void gemm_oc_opt2d(double alpha, double beta, uint32_t m, uint32_t n, + uint32_t k, double* A, double* B, double* C, uint32_t lda, + uint32_t ldb, uint32_t ldc) { /** - * Problem is double buffered in L1. The buffer that is used is toggled at each iteration. - * The DMA cores are one index step ahead so they load the data in advance into the buffer that will be used. - */ + * Problem is double buffered in L1. The buffer that is used is toggled at + * each iteration. The DMA cores are one index step ahead so they load the + * data in advance into the buffer that will be used. + */ volatile uint32_t p[3] = {0, 0, 0}; volatile uint32_t P[3] = {0, 0, 0}; @@ -110,19 +110,18 @@ void gemm_oc_opt2d(double alpha, double beta, // Setup layout for TCDM L1 // For double buffering l1 is a size 2 array - TcdmLayout* l1 = (TcdmLayout*) snrt_l1_next(); + TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next(); TcdmLayout* l1Addr[SNRT_CLUSTER_NUM] = {0}; // Sync l1 pointers between clusters - if (snrt_is_dm_core()) - l1AddrGlobal[p[1]] = l1; + if (snrt_is_dm_core()) l1AddrGlobal[p[1]] = l1; snrt_global_barrier(); if (snrt_is_dm_core()) { memcpy(l1Addr, l1AddrGlobal, SNRT_CLUSTER_NUM * sizeof(*l1Addr)); } bool l1Id_AB = false; - bool l1Id_C = false; + bool l1Id_C = false; // Initialize indices const uint32_t I = m, J = n, K = k; @@ -141,12 +140,13 @@ void gemm_oc_opt2d(double alpha, double beta, volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0; if (snrt_is_compute_core()) { - snrt_global_barrier(); // DMA core is one index ahead + snrt_global_barrier(); // DMA core is one index ahead } // Compute C2C sources for 2D pipeline - volatile const uint32_t pk = (PI + 2 * PJ - pi - pj - 1) % PJ; // pipeline step - int PK = PJ; // pipeline depth + volatile const uint32_t pk = + (PI + 2 * PJ - pi - pj - 1) % PJ; // pipeline step + int PK = PJ; // pipeline depth // Determine C2C source cluster index for each matrix, < 0 is from DRAM TcdmLayout* c2cL1_A = NULL; @@ -155,7 +155,7 @@ void gemm_oc_opt2d(double alpha, double beta, dump_pk(pk); const bool fetch_dram = pk == 0; - + volatile const uint32_t p_srcA = pi * PJ + ((2 * PJ - pi - pk) % PJ); volatile const uint32_t p_srcB = pj + PJ * ((2 * PJ - pj - pk) % PJ); dump_p_src(fetch_dram ? -1 : p_srcA); @@ -181,8 +181,7 @@ void gemm_oc_opt2d(double alpha, double beta, dump_ib(ib); dump_jb(jb); snrt_dma_load_2d_tile(l1_C, C, ib, jb, L1_M, L1_N, ldc, FP64); - if (ib != ib_first || jb != jb_first) - storeC = true; + if (ib != ib_first || jb != jb_first) storeC = true; } FOR_EACH(kb, 0, K / L1_K, 1, kb_dir, kb_prev) { @@ -193,13 +192,15 @@ void gemm_oc_opt2d(double alpha, double beta, // load next A, B if (snrt_is_dm_core()) { if (c2cL1_A == NULL) - snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, FP64); + snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, + FP64); else { double* const c2c_A = c2cL1_A[l1Id_AB].A; snrt_dma_start_1d(l1_A, c2c_A, L1_M * L1_K * FP64); } if (c2cL1_B == NULL) - snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, FP64); + snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, + FP64); else { double* const c2c_B = c2cL1_B[l1Id_AB].B; snrt_dma_start_1d(l1_B, c2c_B, L1_K * L1_N * FP64); @@ -207,37 +208,41 @@ void gemm_oc_opt2d(double alpha, double beta, snrt_dma_wait_all(); } else { - // solve block already in l1, parallelize inside each cluster - gemm_cluster_kernel(alpha, beta, L1_M, L1_N, L1_K, l1_A, l1_B, l1_C, L1_LDA, L1_LDB, L1_LDC); + // solve block already in l1, parallelize inside each + // cluster + gemm_cluster_kernel(alpha, beta, L1_M, L1_N, L1_K, l1_A, + l1_B, l1_C, L1_LDA, L1_LDB, L1_LDC); // gemm(FP64, 0, true, false, false, // L1_M, L1_N, L1_K, alpha, // l1_A, L1_LDA, l1_B, L1_LDB, beta, l1_C, L1_LDC); } - l1Id_AB = !l1Id_AB; // switch buffers + l1Id_AB = !l1Id_AB; // switch buffers snrt_global_barrier(); if (snrt_is_dm_core()) { if (storeC) { storeC = false; - snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64); + snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, + jb_prev, L1_M, L1_N, ldc, FP64); } } kb_prev = kb; } - l1Id_C = !l1Id_C; // switch buffers + l1Id_C = !l1Id_C; // switch buffers jb_prev = jb; ib_prev = ib; } } if (snrt_is_dm_core()) { - snrt_global_barrier(); // DMA core is one index ahead + snrt_global_barrier(); // DMA core is one index ahead // store final tile - snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64); + snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, + ldc, FP64); snrt_dma_wait_all(); } diff --git a/sw/blas/gemm/src/gemm_occamy_baseline.h b/sw/blas/gemm/src/gemm_occamy_baseline.h index cd7c3eefb..67d66ce82 100644 --- a/sw/blas/gemm/src/gemm_occamy_baseline.h +++ b/sw/blas/gemm/src/gemm_occamy_baseline.h @@ -2,12 +2,12 @@ #pragma once -#include #include +#include #include -#include "snrt.h" #include "gemm.h" +#include "snrt.h" #include "dump.h" NAMED_DUMP(uint32_t, aIdx, 0x1a) @@ -25,21 +25,24 @@ NAMED_DUMP(double, c, 0xc) * \param begin Beginning of the range * \param end End of the range * \param dir Sets the direction of traversal. True: loop starts at begin. - * \param i_prev Set the previous index to the first index, must update this manually at the end of the loop. - * \details i_end_floor will contain the exact end with the stride, s.t. the reversed loop starts at the correct index. + * \param i_prev Set the previous index to the first index, must update this + * manually at the end of the loop. \details i_end_floor will contain the exact + * end with the stride, s.t. the reversed loop starts at the correct index. */ -#define FOR_EACH(i, begin, end, stride, dir, i_prev) \ - dir = !dir; \ - const int i##_end_floor = ((end - begin + stride - 1) / stride) * stride - stride + begin; \ - const int i##_first = dir ? begin : i##_end_floor; \ - const int i##_last = dir ? i##_end_floor : begin; \ - i = i##_first; \ - i_prev = i; \ - for (; dir ? i <= i##_last : i >= i##_last; i = dir ? i + stride : i - stride) - -#define L1_M 8 //128; -#define L1_N 8 //128; -#define L1_K 8 //128; +#define FOR_EACH(i, begin, end, stride, dir, i_prev) \ + dir = !dir; \ + const int i##_end_floor = \ + ((end - begin + stride - 1) / stride) * stride - stride + begin; \ + const int i##_first = dir ? begin : i##_end_floor; \ + const int i##_last = dir ? i##_end_floor : begin; \ + i = i##_first; \ + i_prev = i; \ + for (; dir ? i <= i##_last : i >= i##_last; \ + i = dir ? i + stride : i - stride) + +#define L1_M 8 // 128; +#define L1_N 8 // 128; +#define L1_K 8 // 128; #define L1_LDA L1_K #define L1_LDB L1_N #define L1_LDC L1_N @@ -58,23 +61,22 @@ NAMED_DUMP(TcdmLayout*, l1, 0x8) /** * \brief Each cluster performs a GEMM for A, B, C inside each TCDM */ -void gemm_cluster_kernel(double alpha, double beta, - uint32_t M, uint32_t N, uint32_t K, - double* const A, double* const B, double* const C, - int lda, int ldb, int ldc) { +void gemm_cluster_kernel(double alpha, double beta, uint32_t M, uint32_t N, + uint32_t K, double* const A, double* const B, + double* const C, int lda, int ldb, int ldc) { uint32_t p[3], P[3]; ocrt_thread_idx(p); ocrt_compute_thread_num(P); for (uint32_t i = p[0]; i < M; i += P[0]) { for (uint32_t j = 0; j < N; j++) { - uint32_t cIdx = i * ldc + j; // C[i][j] + uint32_t cIdx = i * ldc + j; // C[i][j] // dump_cIdx(cIdx); // dump_c(C[cIdx]); register double c0 = beta * C[cIdx]; for (uint32_t k = 0; k < K; k++) { - uint32_t aIdx = i * lda + k; // A[i][k] - uint32_t bIdx = k * ldb + j; // B[k][j] + uint32_t aIdx = i * lda + k; // A[i][k] + uint32_t bIdx = k * ldb + j; // B[k][j] // dump_aIdx(aIdx); // dump_bIdx(bIdx); // dump_a(A[aIdx]); @@ -88,14 +90,14 @@ void gemm_cluster_kernel(double alpha, double beta, snrt_fpu_fence(); } -void gemm_oc_baseline(double alpha, double beta, - uint32_t m, uint32_t n, uint32_t k, - double* A, double* B, double* C, - uint32_t lda, uint32_t ldb, uint32_t ldc) { +void gemm_oc_baseline(double alpha, double beta, uint32_t m, uint32_t n, + uint32_t k, double* A, double* B, double* C, uint32_t lda, + uint32_t ldb, uint32_t ldc) { /** - * Problem is double buffered in L1. The buffer that is used is toggled at each iteration. - * The DMA cores are one index step ahead so they load the data in advance into the buffer that will be used. - */ + * Problem is double buffered in L1. The buffer that is used is toggled at + * each iteration. The DMA cores are one index step ahead so they load the + * data in advance into the buffer that will be used. + */ volatile uint32_t p[3] = {0, 0, 0}; volatile uint32_t P[3] = {0, 0, 0}; @@ -104,10 +106,10 @@ void gemm_oc_baseline(double alpha, double beta, // Setup layout for TCDM L1 // For double buffering l1 is a size 2 array - TcdmLayout* l1 = (TcdmLayout*) snrt_l1_next(); + TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next(); bool l1Id_AB = false; - bool l1Id_C = false; + bool l1Id_C = false; // Initialize indices const uint32_t I = m, J = n, K = k; @@ -126,26 +128,28 @@ void gemm_oc_baseline(double alpha, double beta, volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0; if (snrt_is_compute_core()) { - snrt_cluster_hw_barrier(); // DMA core is one index ahead + snrt_cluster_hw_barrier(); // DMA core is one index ahead } // FOR_EACH(ib, pi, I / L1_M, PI, ib_dir, ib_prev) { - ib_dir = !ib_dir; + ib_dir = !ib_dir; const int ib_end_floor = ((I / 8 - pi + PI - 1) / PI) * PI - PI + pi; - const int ib_first = ib_dir ? pi : ib_end_floor; - const int ib_last = ib_dir ? ib_end_floor : pi; - ib = ib_first; - ib_prev = ib; - for (; ib_dir ? ib <= ib_last : ib >= ib_last; ib = ib_dir ? ib + PI : ib - PI) { + const int ib_first = ib_dir ? pi : ib_end_floor; + const int ib_last = ib_dir ? ib_end_floor : pi; + ib = ib_first; + ib_prev = ib; + for (; ib_dir ? ib <= ib_last : ib >= ib_last; + ib = ib_dir ? ib + PI : ib - PI) { ib_cnt += ib; // FOR_EACH(jb, pj, J / L1_N, PJ, jb_dir, jb_prev) { - jb_dir = !jb_dir; + jb_dir = !jb_dir; const int jb_end_floor = ((J / 8 - pj + PJ - 1) / PJ) * PJ - PJ + pj; - const int jb_first = jb_dir ? pj : jb_end_floor; - const int jb_last = jb_dir ? jb_end_floor : pj; - jb = jb_first; - jb_prev = jb; - for (; jb_dir ? jb <= jb_last : jb >= jb_last; jb = jb_dir ? jb + PJ : jb - PJ) { + const int jb_first = jb_dir ? pj : jb_end_floor; + const int jb_last = jb_dir ? jb_end_floor : pj; + jb = jb_first; + jb_prev = jb; + for (; jb_dir ? jb <= jb_last : jb >= jb_last; + jb = jb_dir ? jb + PJ : jb - PJ) { jb_cnt += jb; double* const l1_C = l1[l1Id_C].C; @@ -154,60 +158,64 @@ void gemm_oc_baseline(double alpha, double beta, dump_ib(ib); dump_jb(jb); snrt_dma_load_2d_tile(l1_C, C, ib, jb, L1_M, L1_N, ldc, FP64); - if (ib != ib_first || jb != jb_first) - storeC = true; + if (ib != ib_first || jb != jb_first) storeC = true; } // FOR_EACH(kb, 0, K / L1_K, 1, kb_dir, kb_prev) { - kb_dir = !kb_dir; + kb_dir = !kb_dir; const int kb_end_floor = ((K / L1_K - 0 + 1 - 1) / 1) * 1 - 1 + 0; - const int kb_first = kb_dir ? 0 : kb_end_floor; - const int kb_last = kb_dir ? kb_end_floor : 0; - kb = kb_first; - kb_prev = kb; - for (; kb_dir ? kb <= kb_last : kb >= kb_last; kb = kb_dir ? kb + 1 : kb - 1) { + const int kb_first = kb_dir ? 0 : kb_end_floor; + const int kb_last = kb_dir ? kb_end_floor : 0; + kb = kb_first; + kb_prev = kb; + for (; kb_dir ? kb <= kb_last : kb >= kb_last; + kb = kb_dir ? kb + 1 : kb - 1) { kb_cnt += kb; double* const l1_A = l1[l1Id_AB].A; double* const l1_B = l1[l1Id_AB].B; // load next A, B if (snrt_is_dm_core()) { - snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, FP64); - snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, FP64); + snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, + FP64); + snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, + FP64); snrt_dma_wait_all(); } else { - // solve block already in l1, parallelize inside each cluster - // gemm_cluster_kernel(alpha, beta, L1_M, L1_N, L1_K, l1_A, l1_B, l1_C, L1_LDA, L1_LDB, L1_LDC); + // solve block already in l1, parallelize inside each + // cluster gemm_cluster_kernel(alpha, beta, L1_M, L1_N, + // L1_K, l1_A, l1_B, l1_C, L1_LDA, L1_LDB, L1_LDC); - gemm(FP64, 0, true, false, false, - L1_M, L1_N, L1_K, alpha, + gemm(FP64, 0, true, false, false, L1_M, L1_N, L1_K, alpha, l1_A, L1_LDA, l1_B, L1_LDB, beta, l1_C, L1_LDC); } - l1Id_AB = !l1Id_AB; // switch buffers + l1Id_AB = !l1Id_AB; // switch buffers snrt_cluster_hw_barrier(); if (snrt_is_dm_core()) { if (storeC) { storeC = false; - snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64); + snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, + jb_prev, L1_M, L1_N, ldc, FP64); } } kb_prev = kb; } - l1Id_C = !l1Id_C; // switch buffers + l1Id_C = !l1Id_C; // switch buffers jb_prev = jb; ib_prev = ib; } } if (snrt_is_dm_core()) { - snrt_cluster_hw_barrier(); // DMA core is one index ahead + snrt_cluster_hw_barrier(); // DMA core is one index ahead // store final tile - snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64); + snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, + ldc, FP64); snrt_dma_wait_all(); } diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c index e5b63b351..183986970 100644 --- a/sw/blas/gemm/src/main.c +++ b/sw/blas/gemm/src/main.c @@ -6,8 +6,8 @@ // Luca Colagrande #include -#include #include +#include #include "snrt.h" @@ -27,8 +27,8 @@ int main() { uint32_t ldb = N; uint32_t ldc = N; - gemm_oc(dtype_size, expand, setup_ssr, TA, TB, M, N, K, 1, - a, lda, b, ldb, BETA, c, ldc); + gemm_oc(dtype_size, expand, setup_ssr, TA, TB, M, N, K, 1, a, lda, b, ldb, + BETA, c, ldc); uint32_t end_cycle = snrt_mcycle(); @@ -42,8 +42,7 @@ int main() { for (uint32_t m = 0; m < M; m++) { for (uint32_t n = 0; n < N; n++) { uint32_t idx = m * N + n; - if (fabs(result[idx] - c[idx]) < 0.001) - errors--; + if (fabs(result[idx] - c[idx]) < 0.001) errors--; } } // printf("%d/%d Errors\n", errors, M * N); @@ -135,7 +134,8 @@ int main() { // errors--; // break; // case FP32: -// if (fabs(result[idx] - ((float *)local_c)[idx]) > 0.001) +// if (fabs(result[idx] - ((float *)local_c)[idx]) > +// 0.001) // errors--; // break; // case FP16: