Move args into TCDM before timing; simplify l1Ptr calculation

pulp-platform · Jan 17, 2024 · 459f106 · 459f106
1 parent b29937d
commit 459f106
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 40 deletions.
diff --git a/sw/blas/gemm/src/gemm_kernel.h b/sw/blas/gemm/src/gemm_kernel.h
@@ -6,6 +6,8 @@
 //         Luca Bertaccini <[email protected]>
 //         Luca Colagrande <[email protected]>
 
+#pragma once
+
 #include <stdint.h>
 
 #include "snrt.h"

diff --git a/sw/blas/gemm/src/gemm_occamy_2dpipe.h b/sw/blas/gemm/src/gemm_occamy_2dpipe.h
@@ -6,7 +6,7 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "gemm.h"
+#include "gemm_kernel.h"
 #include "snrt.h"
 
 #include "dump.h"
@@ -110,20 +110,7 @@ void gemm_oc_opt2d(double alpha, double beta, uint32_t m, uint32_t n,
 
     // Setup layout for TCDM L1
     // For double buffering l1 is a size 2 array
-    TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next();
-    TcdmLayout* l1Addr[SNRT_CLUSTER_NUM] = {0};
-
-    // Sync l1 pointers between clusters
-    if (snrt_is_dm_core()) {
-        l1AddrGlobal[p[1]] = l1;
-    }
-    snrt_global_barrier();
-    if (snrt_is_dm_core()) {
-        for (int i = 0; i < SNRT_CLUSTER_NUM; ++i)
-            l1Addr[i] = l1 + cluster_offset * (i - snrt_cluster_idx());
-
-        // memcpy(l1Addr, l1AddrGlobal, SNRT_CLUSTER_NUM * sizeof(*l1Addr));
-    }
+    TcdmLayout* l1 = snrt_l1_next();
 
     bool l1Id_AB = false;
     bool l1Id_C = false;
@@ -139,35 +126,39 @@ void gemm_oc_opt2d(double alpha, double beta, uint32_t m, uint32_t n,
     int ib_prev, jb_prev, kb_prev;
     bool ib_dir = false, jb_dir = false, kb_dir = false;
 
-    bool storeC = false;
-
     // Debug
     volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;
 
-    if (snrt_is_compute_core()) {
-        snrt_global_barrier();  // DMA core is one index ahead
-    }
+    bool storeC = false;
 
-    // Compute C2C sources for 2D pipeline
-    volatile const uint32_t pk =
-        (PI + 2 * PJ - pi - pj - 1) % PJ;  // pipeline step
-    int PK = PJ;                           // pipeline depth
+    // -- Compute C2C sources for 2D pipeline
+    const uint32_t pk = (PI + 2 * PJ - pi - pj - 1) % PJ; // pipeline step
+    const int PK      = PJ;                               // pipeline depth
 
     // Determine C2C source cluster index for each matrix, < 0 is from DRAM
     TcdmLayout* c2cL1_A = NULL;
     TcdmLayout* c2cL1_B = NULL;
     if (snrt_is_dm_core()) {
-        dump_pk(pk);
-
-        const bool fetch_dram = pk == 0;
+        // -- Sync l1 pointers between clusters
+        TcdmLayout* l1Ptr[SNRT_CLUSTER_NUM];
+        for (int i = 0; i < SNRT_CLUSTER_NUM; ++i)
+            l1Ptr[i] = l1 + cluster_offset * (i - snrt_cluster_idx());
 
+        // 2D pipeline indices, see notes or python notebook for details
+        // Works for PI = PJ
         volatile const uint32_t p_srcA = pi * PJ + ((2 * PJ - pi - pk) % PJ);
         volatile const uint32_t p_srcB = pj + PJ * ((2 * PJ - pj - pk) % PJ);
-        dump_p_src(fetch_dram ? -1 : p_srcA);
-        dump_p_src(fetch_dram ? -1 : p_srcB);
 
-        c2cL1_A = fetch_dram ? NULL : l1Addr[p_srcA];
-        c2cL1_B = fetch_dram ? NULL : l1Addr[p_srcB];
+        const bool fetch_dram = pk == 0;
+        c2cL1_A = fetch_dram ? NULL : l1Ptr[p_srcA];
+        c2cL1_B = fetch_dram ? NULL : l1Ptr[p_srcB];
+
+        // dump_p_src(fetch_dram ? -1 : p_srcA);
+        // dump_p_src(fetch_dram ? -1 : p_srcB);
+    }
+
+    if (snrt_is_compute_core()) {
+        snrt_global_barrier();  // DMA core is one index ahead
     }
 
     // Wait for pipeline to be filled

diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c
@@ -21,16 +21,30 @@ NAMED_DUMP(uint32_t, err, 0x7)
 
 int main() {
     const bool setup_ssr = true;
-    uint32_t start_cycle = snrt_mcycle();
+
+    // load into TCDM
+    uint32_t data_M          = M;
+    uint32_t data_N          = N;
+    uint32_t data_K          = K;
+    uint32_t data_TA         = TA;
+    uint32_t data_TB         = TB;
+    double data_BETA         = BETA;
+    uint32_t data_dtype_size = dtype_size;
+    uint32_t data_expand     = expand;
+    double* data_a           = a;
+    double* data_b           = b;
+    double* data_c           = c;
 
     uint32_t lda = K;
     uint32_t ldb = N;
     uint32_t ldc = N;
 
-    gemm_oc(dtype_size, expand, setup_ssr, TA, TB, M, N, K, 1, a, lda, b, ldb,
-            BETA, c, ldc);
-
-    uint32_t end_cycle = snrt_mcycle();
+    for (volatile int i = 2; i > 0; --i) {
+        if (i == 1) snrt_mcycle(); // start
+        gemm_oc(data_dtype_size, data_expand, setup_ssr, data_TA, data_TB, data_M, data_N, data_K, 1,
+                data_a, lda, data_b, ldb, data_BETA, data_c, ldc);
+        if (i == 1) snrt_mcycle(); // end
+    }
 
     snrt_fpu_fence();
     snrt_global_barrier();
@@ -39,10 +53,11 @@ int main() {
     uint32_t errors = M * N;
 
     if (snrt_global_core_idx() == 0) {
-        for (uint32_t m = 0; m < M; m++) {
-            for (uint32_t n = 0; n < N; n++) {
-                uint32_t idx = m * N + n;
-                if (fabs(result[idx] - c[idx]) < 0.001) errors--;
+        for (uint32_t i = 0; i < M; i++) {
+            for (uint32_t j = 0; j < N; j++) {
+                uint32_t idx = i * N + j;
+                if (fabs(result[idx] - c[idx]) < 0.001)
+                    errors--;
             }
         }
         // printf("%d/%d Errors\n", errors, M * N);