Commit

clang-format
rogerbarton committed Jan 15, 2024
1 parent 68d5e3f commit 220a908
Showing 4 changed files with 184 additions and 167 deletions.
96 changes: 50 additions & 46 deletions sw/blas/gemm/src/gemm_occamy_1dpipe.h
@@ -2,12 +2,12 @@

#pragma once

Check failure on line 3 in sw/blas/gemm/src/gemm_occamy_1dpipe.h (GitHub Actions / Check License headers):
FAILED: First comment ended before licence notice

#include <stdint.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#include "snrt.h"
#include "gemm.h"
#include "snrt.h"

#include "dump.h"
NAMED_DUMP(uint32_t, aIdx, 0x1a)
@@ -20,29 +20,32 @@ NAMED_DUMP(double, a, 0xa)
NAMED_DUMP(double, b, 0xb)
NAMED_DUMP(double, c, 0xc)

#define MIN(a,b) ((a)<(b)?(a):(b))
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/**
* \brief Implements a reversing loop for an index range
* \param begin Beginning of the range
* \param end End of the range
* \param dir Sets the direction of traversal. True: loop starts at begin.
* \param i_prev Set the previous index to the first index, must update this manually at the end of the loop.
* \details i_end_floor will contain the exact end with the stride, s.t. the reversed loop starts at the correct index.
* \param i_prev Set the previous index to the first index, must update this
* manually at the end of the loop. \details i_end_floor will contain the exact
* end with the stride, s.t. the reversed loop starts at the correct index.
*/
#define FOR_EACH(i, begin, end, stride, dir, i_prev) \
dir = !dir; \
const int i##_end_floor = ((end - begin + stride - 1) / stride) * stride - stride + begin; \
const int i##_first = dir ? begin : i##_end_floor; \
const int i##_last = dir ? i##_end_floor : begin; \
i = i##_first; \
i_prev = i; \
for (; dir ? i <= i##_last : i >= i##_last; i = dir ? i + stride : i - stride)

#define L1_M 8 //128;
#define L1_N 8 //128;
#define L1_K 8 //128;
#define FOR_EACH(i, begin, end, stride, dir, i_prev) \
dir = !dir; \
const int i##_end_floor = \
((end - begin + stride - 1) / stride) * stride - stride + begin; \
const int i##_first = dir ? begin : i##_end_floor; \
const int i##_last = dir ? i##_end_floor : begin; \
i = i##_first; \
i_prev = i; \
for (; dir ? i <= i##_last : i >= i##_last; \
i = dir ? i + stride : i - stride)
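/*
 * Editorial sketch (not part of this commit): a minimal use of FOR_EACH, with
 * `i`, `i_prev`, and `dir` declared by the caller, as the kernel below does.
 *
 *   int i, i_prev;
 *   bool dir = false;  // FOR_EACH toggles this, so the first pass runs forward
 *   FOR_EACH(i, 0, 10, 3, dir, i_prev) {
 *       // The first execution visits 0, 3, 6, 9; re-executing the same
 *       // statement (with dir still in scope) flips dir and visits 9, 6, 3, 0,
 *       // resuming where the previous pass stopped.
 *       i_prev = i;  // manual update required by the macro contract
 *   }
 */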

#define L1_M 8 // 128;
#define L1_N 8 // 128;
#define L1_K 8 // 128;
#define L1_LDA L1_K
#define L1_LDB L1_N
#define L1_LDC L1_N
@@ -60,14 +63,14 @@ NAMED_DUMP(TcdmLayout*, l1, 0x8)

TcdmLayout* l1AddrGlobal[SNRT_CLUSTER_NUM] = {0};

void gemm_oc_opt1d(double alpha, double beta,
uint32_t m, uint32_t n, uint32_t k,
double* A, double* B, double* C,
uint32_t lda, uint32_t ldb, uint32_t ldc) {
void gemm_oc_opt1d(double alpha, double beta, uint32_t m, uint32_t n,
uint32_t k, double* A, double* B, double* C, uint32_t lda,
uint32_t ldb, uint32_t ldc) {
/**
* Problem is double buffered in L1. The buffer that is used is toggled at each iteration.
* The DMA cores are one index step ahead so they load the data in advance into the buffer that will be used.
*/
* Problem is double buffered in L1. The buffer that is used is toggled at
* each iteration. The DMA cores are one index step ahead so they load the
* data in advance into the buffer that will be used.
*/
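/*
 * Editorial sketch (not part of this commit): the double-buffering pattern
 * described above, in isolation. One buffer is filled by the DMA core while
 * the compute cores consume the other; load_next_tile and compute_on_tile are
 * placeholders, the snrt_* calls are the ones used elsewhere in this file.
 *
 *   bool buf = false;
 *   if (snrt_is_dm_core()) load_next_tile(&l1[buf]);  // prime the first buffer
 *   snrt_global_barrier();
 *   for (int it = 0; it < n_tiles; ++it) {
 *       if (snrt_is_dm_core()) {
 *           if (it + 1 < n_tiles)
 *               load_next_tile(&l1[!buf]);   // fetch data for iteration it + 1
 *       } else {
 *           compute_on_tile(&l1[buf]);       // data loaded one step earlier
 *       }
 *       buf = !buf;                          // swap buffer roles
 *       snrt_global_barrier();
 *   }
 */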

volatile uint32_t p[3] = {0, 0, 0};
volatile uint32_t P[3] = {0, 0, 0};
@@ -76,12 +79,11 @@ void gemm_oc_opt1d(double alpha, double beta,

// Setup layout for TCDM L1
// For double buffering l1 is a size 2 array
TcdmLayout* l1 = (TcdmLayout*) snrt_l1_next();
TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next();
TcdmLayout* l1Addr[SNRT_CLUSTER_NUM] = {0};

// Sync l1 pointers between clusters
if (snrt_is_dm_core())
l1AddrGlobal[p[1]] = l1;
if (snrt_is_dm_core()) l1AddrGlobal[p[1]] = l1;
snrt_global_barrier();
if (snrt_is_dm_core()) {
memcpy(l1Addr, l1AddrGlobal, SNRT_CLUSTER_NUM * sizeof(*l1Addr));
@@ -91,7 +93,7 @@ void gemm_oc_opt1d(double alpha, double beta,
}

bool l1Id_AB = false;
bool l1Id_C = false;
bool l1Id_C = false;

// Initialize indices
const uint32_t I = m, J = n, K = k;
@@ -110,20 +112,19 @@ volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;
volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;

if (snrt_is_compute_core()) {
snrt_global_barrier(); // DMA core is one index ahead
snrt_global_barrier(); // DMA core is one index ahead
}

// -- Compute C2C sources for 2D pipeline
volatile const uint32_t pk = MAX(pi, pj); // pipeline step
int PK = MAX(PI, PJ); // pipeline depth
volatile const uint32_t pk = MAX(pi, pj); // pipeline step
int PK = MAX(PI, PJ); // pipeline depth

// Determine C2C source cluster index for each matrix, < 0 is from DRAM
int pIdx_A = pj - 1;
int pIdx_B = pi - 1;
TcdmLayout* const p_srcA = pIdx_A == 0 ? NULL : l1Addr[pIdx_A - 1];
TcdmLayout* const p_srcB = pIdx_B == 0 ? NULL : l1Addr[pIdx_B - 1];


// Wait for pipeline to be filled
for (int pipeline = pk; pipeline > 0; --pipeline) {
snrt_global_barrier();
@@ -140,8 +141,7 @@ void gemm_oc_opt1d(double alpha, double beta,
dump_ib(ib);
dump_jb(jb);
snrt_dma_load_2d_tile(l1_C, C, ib, jb, L1_M, L1_N, ldc, FP64);
if (ib != ib_first || jb != jb_first)
storeC = true;
if (ib != ib_first || jb != jb_first) storeC = true;
}

FOR_EACH(kb, 0, K / L1_K, 1, kb_dir, kb_prev) {
@@ -153,54 +153,58 @@
if (snrt_is_dm_core()) {
// TODO: use multicast instead
if (p_srcA == NULL)
snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, FP64);
snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda,
FP64);
else {
double* const c2c_A = p_srcA[l1Id_AB].A;
snrt_dma_start_1d(l1_A, c2c_A, L1_M * L1_K * FP64);
}
if (p_srcB == NULL)
snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, FP64);
snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb,
FP64);
else {
double* const c2c_B = p_srcB[l1Id_AB].B;
snrt_dma_start_1d(l1_B, c2c_B, L1_K * L1_N * FP64);
}

snrt_dma_wait_all();
} else {
// solve block already in l1, parallelize inside each cluster
gemm(FP64, 0, true, false, false,
L1_M, L1_N, L1_K, alpha,
// solve block already in l1, parallelize inside each
// cluster
gemm(FP64, 0, true, false, false, L1_M, L1_N, L1_K, alpha,
l1_A, L1_LDA, l1_B, L1_LDB, beta, l1_C, L1_LDC);
}

l1Id_AB = !l1Id_AB; // switch buffers
l1Id_AB = !l1Id_AB; // switch buffers
snrt_global_barrier();

if (snrt_is_dm_core()) {
if (storeC) {
storeC = false;
snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64);
snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev,
jb_prev, L1_M, L1_N, ldc, FP64);
}
}
kb_prev = kb;
}

l1Id_C = !l1Id_C; // switch buffers
l1Id_C = !l1Id_C; // switch buffers
jb_prev = jb;
ib_prev = ib;
}
}

if (snrt_is_dm_core()) {
snrt_global_barrier(); // DMA core is one index ahead
snrt_global_barrier(); // DMA core is one index ahead

// store final tile
snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64);
snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N,
ldc, FP64);
snrt_dma_wait_all();
}

// Wait for pipeline to be emptied
for (int pipeline = pk; pipeline < PK -1; ++pipeline) {
for (int pipeline = pk; pipeline < PK - 1; ++pipeline) {
snrt_global_barrier();
}
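/*
 * Editorial note (not part of this commit): assuming pi and pj are zero-based
 * cluster coordinates, pk = MAX(pi, pj) and PK = MAX(PI, PJ), a cluster spends
 * pk barriers filling the pipeline above and PK - 1 - pk barriers draining it
 * here, so every cluster pays the same PK - 1 extra barriers in total.
 */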
}
