From 220a908bba292790ee3831437f9e72b54687703a Mon Sep 17 00:00:00 2001
From: Roger Barton <rbarton@student.ethz.ch>
Date: Mon, 15 Jan 2024 16:56:23 +0100
Subject: [PATCH] clang-format

---
 sw/blas/gemm/src/gemm_occamy_1dpipe.h   |  96 +++++++++--------
 sw/blas/gemm/src/gemm_occamy_2dpipe.h   | 107 ++++++++++---------
 sw/blas/gemm/src/gemm_occamy_baseline.h | 136 +++++++++++++-----------
 sw/blas/gemm/src/main.c                 |  12 +--
 4 files changed, 184 insertions(+), 167 deletions(-)

diff --git a/sw/blas/gemm/src/gemm_occamy_1dpipe.h b/sw/blas/gemm/src/gemm_occamy_1dpipe.h
index 7becebbd7..56f63cd9d 100644
--- a/sw/blas/gemm/src/gemm_occamy_1dpipe.h
+++ b/sw/blas/gemm/src/gemm_occamy_1dpipe.h
@@ -2,12 +2,12 @@
 
 #pragma once
 
-#include <stdint.h>
 #include <stdbool.h>
+#include <stdint.h>
 #include <string.h>
 
-#include "snrt.h"
 #include "gemm.h"
+#include "snrt.h"
 
 #include "dump.h"
 NAMED_DUMP(uint32_t, aIdx, 0x1a)
@@ -20,29 +20,32 @@ NAMED_DUMP(double, a, 0xa)
 NAMED_DUMP(double, b, 0xb)
 NAMED_DUMP(double, c, 0xc)
 
-#define MIN(a,b) ((a)<(b)?(a):(b))
-#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
 
 /**
  * \brief Implements a reversing loop for an index range
  * \param begin Beginning of the range
  * \param end End of the range
  * \param dir Sets the direction of traversal. True: loop starts at begin.
- * \param i_prev Set the previous index to the first index, must update this manually at the end of the loop.
- * \details i_end_floor will contain the exact end with the stride, s.t. the reversed loop starts at the correct index.
+ * \param i_prev Set to the first index; must be updated manually at loop end.
+ * \details i_end_floor will contain the exact end with the stride, s.t. the
+ * reversed loop starts at the correct index.
  */
-#define FOR_EACH(i, begin, end, stride, dir, i_prev)                                                                   \
-    dir = !dir;                                                                                                        \
-    const int i##_end_floor = ((end - begin + stride - 1) / stride) * stride - stride + begin;                         \
-    const int i##_first     = dir ? begin : i##_end_floor;                                                             \
-    const int i##_last      = dir ? i##_end_floor : begin;                                                             \
-    i                       = i##_first;                                                                               \
-    i_prev                  = i;                                                                                       \
-    for (; dir ? i <= i##_last : i >= i##_last; i = dir ? i + stride : i - stride)
-
-#define L1_M 8 //128;
-#define L1_N 8 //128;
-#define L1_K 8 //128;
+#define FOR_EACH(i, begin, end, stride, dir, i_prev)                     \
+    dir = !dir;                                                          \
+    const int i##_end_floor =                                            \
+        ((end - begin + stride - 1) / stride) * stride - stride + begin; \
+    const int i##_first = dir ? begin : i##_end_floor;                   \
+    const int i##_last = dir ? i##_end_floor : begin;                    \
+    i = i##_first;                                                       \
+    i_prev = i;                                                          \
+    for (; dir ? i <= i##_last : i >= i##_last;                          \
+         i = dir ? i + stride : i - stride)
+
+#define L1_M 8  // 128;
+#define L1_N 8  // 128;
+#define L1_K 8  // 128;
 #define L1_LDA L1_K
 #define L1_LDB L1_N
 #define L1_LDC L1_N
@@ -60,14 +63,14 @@ NAMED_DUMP(TcdmLayout*, l1, 0x8)
 
 TcdmLayout* l1AddrGlobal[SNRT_CLUSTER_NUM] = {0};
 
-void gemm_oc_opt1d(double alpha, double beta,
-                      uint32_t m, uint32_t n, uint32_t k,
-                      double* A, double* B, double* C,
-                      uint32_t lda, uint32_t ldb, uint32_t ldc) {
+void gemm_oc_opt1d(double alpha, double beta, uint32_t m, uint32_t n,
+                   uint32_t k, double* A, double* B, double* C, uint32_t lda,
+                   uint32_t ldb, uint32_t ldc) {
     /**
-    * Problem is double buffered in L1. The buffer that is used is toggled at each iteration.
-    * The DMA cores are one index step ahead so they load the data in advance into the buffer that will be used.
-    */
+     * Problem is double buffered in L1. The buffer that is used is toggled at
+     * each iteration. The DMA cores are one index step ahead so they load the
+     * data in advance into the buffer that will be used.
+     */
 
     volatile uint32_t p[3] = {0, 0, 0};
     volatile uint32_t P[3] = {0, 0, 0};
@@ -76,12 +79,11 @@ void gemm_oc_opt1d(double alpha, double beta,
 
     // Setup layout for TCDM L1
     // For double buffering l1 is a size 2 array
-    TcdmLayout* l1 = (TcdmLayout*) snrt_l1_next();
+    TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next();
     TcdmLayout* l1Addr[SNRT_CLUSTER_NUM] = {0};
 
     // Sync l1 pointers between clusters
-    if (snrt_is_dm_core())
-        l1AddrGlobal[p[1]] = l1;
+    if (snrt_is_dm_core()) l1AddrGlobal[p[1]] = l1;
     snrt_global_barrier();
     if (snrt_is_dm_core()) {
         memcpy(l1Addr, l1AddrGlobal, SNRT_CLUSTER_NUM * sizeof(*l1Addr));
@@ -91,7 +93,7 @@ void gemm_oc_opt1d(double alpha, double beta,
     }
 
     bool l1Id_AB = false;
-    bool l1Id_C  = false;
+    bool l1Id_C = false;
 
     // Initialize indices
     const uint32_t I = m, J = n, K = k;
@@ -110,12 +112,12 @@ void gemm_oc_opt1d(double alpha, double beta,
     volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;
 
     if (snrt_is_compute_core()) {
-        snrt_global_barrier(); // DMA core is one index ahead
+        snrt_global_barrier();  // DMA core is one index ahead
     }
 
     // -- Compute C2C sources for 2D pipeline
-    volatile const uint32_t pk = MAX(pi, pj); // pipeline step
-    int PK                     = MAX(PI, PJ); // pipeline depth
+    volatile const uint32_t pk = MAX(pi, pj);  // pipeline step
+    int PK = MAX(PI, PJ);                      // pipeline depth
 
     // Determine C2C source cluster index for each matrix, < 0 is from DRAM
     int pIdx_A = pj - 1;
@@ -123,7 +125,6 @@ void gemm_oc_opt1d(double alpha, double beta,
     TcdmLayout* const p_srcA = pIdx_A == 0 ? NULL : l1Addr[pIdx_A - 1];
     TcdmLayout* const p_srcB = pIdx_B == 0 ? NULL : l1Addr[pIdx_B - 1];
 
-
     // Wait for pipeline to be filled
     for (int pipeline = pk; pipeline > 0; --pipeline) {
         snrt_global_barrier();
@@ -140,8 +141,7 @@ void gemm_oc_opt1d(double alpha, double beta,
                 dump_ib(ib);
                 dump_jb(jb);
                 snrt_dma_load_2d_tile(l1_C, C, ib, jb, L1_M, L1_N, ldc, FP64);
-                if (ib != ib_first || jb != jb_first)
-                    storeC = true;
+                if (ib != ib_first || jb != jb_first) storeC = true;
             }
 
             FOR_EACH(kb, 0, K / L1_K, 1, kb_dir, kb_prev) {
@@ -153,13 +153,15 @@ void gemm_oc_opt1d(double alpha, double beta,
                 if (snrt_is_dm_core()) {
                     // TODO: use multicast instead
                     if (p_srcA == NULL)
-                        snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, FP64);
+                        snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda,
+                                              FP64);
                     else {
                         double* const c2c_A = p_srcA[l1Id_AB].A;
                         snrt_dma_start_1d(l1_A, c2c_A, L1_M * L1_K * FP64);
                     }
                     if (p_srcB == NULL)
-                        snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, FP64);
+                        snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb,
+                                              FP64);
                     else {
                         double* const c2c_B = p_srcB[l1Id_AB].B;
                         snrt_dma_start_1d(l1_B, c2c_B, L1_K * L1_N * FP64);
@@ -167,40 +169,42 @@ void gemm_oc_opt1d(double alpha, double beta,
 
                     snrt_dma_wait_all();
                 } else {
-                    // solve block already in l1, parallelize inside each cluster
-                    gemm(FP64, 0, true, false, false,
-                         L1_M, L1_N, L1_K, alpha,
+                    // solve block already in l1, parallelize inside each
+                    // cluster
+                    gemm(FP64, 0, true, false, false, L1_M, L1_N, L1_K, alpha,
                          l1_A, L1_LDA, l1_B, L1_LDB, beta, l1_C, L1_LDC);
                 }
 
-                l1Id_AB = !l1Id_AB; // switch buffers
+                l1Id_AB = !l1Id_AB;  // switch buffers
                 snrt_global_barrier();
 
                 if (snrt_is_dm_core()) {
                     if (storeC) {
                         storeC = false;
-                        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64);
+                        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev,
+                                               jb_prev, L1_M, L1_N, ldc, FP64);
                     }
                 }
                 kb_prev = kb;
             }
 
-            l1Id_C  = !l1Id_C; // switch buffers
+            l1Id_C = !l1Id_C;  // switch buffers
             jb_prev = jb;
             ib_prev = ib;
         }
     }
 
     if (snrt_is_dm_core()) {
-        snrt_global_barrier(); // DMA core is one index ahead
+        snrt_global_barrier();  // DMA core is one index ahead
 
         // store final tile
-        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64);
+        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N,
+                               ldc, FP64);
         snrt_dma_wait_all();
     }
 
     // Wait for pipeline to be emptied
-    for (int pipeline = pk; pipeline < PK -1; ++pipeline) {
+    for (int pipeline = pk; pipeline < PK - 1; ++pipeline) {
         snrt_global_barrier();
     }
 }
diff --git a/sw/blas/gemm/src/gemm_occamy_2dpipe.h b/sw/blas/gemm/src/gemm_occamy_2dpipe.h
index a65a4f8e1..054919644 100644
--- a/sw/blas/gemm/src/gemm_occamy_2dpipe.h
+++ b/sw/blas/gemm/src/gemm_occamy_2dpipe.h
@@ -2,12 +2,12 @@
 
 #pragma once
 
-#include <stdint.h>
 #include <stdbool.h>
+#include <stdint.h>
 #include <string.h>
 
-#include "snrt.h"
 #include "gemm.h"
+#include "snrt.h"
 
 #include "dump.h"
 NAMED_DUMP(uint32_t, aIdx, 0x1a)
@@ -27,21 +27,24 @@ NAMED_DUMP(double, c, 0xc)
  * \param begin Beginning of the range
  * \param end End of the range
  * \param dir Sets the direction of traversal. True: loop starts at begin.
- * \param i_prev Set the previous index to the first index, must update this manually at the end of the loop.
- * \details i_end_floor will contain the exact end with the stride, s.t. the reversed loop starts at the correct index.
+ * \param i_prev Set to the first index; must be updated manually at loop end.
+ * \details i_end_floor will contain the exact end with the stride, s.t. the
+ * reversed loop starts at the correct index.
  */
-#define FOR_EACH(i, begin, end, stride, dir, i_prev)                                                                   \
-    dir = !dir;                                                                                                        \
-    const int i##_end_floor = ((end - begin + stride - 1) / stride) * stride - stride + begin;                         \
-    const int i##_first     = dir ? begin : i##_end_floor;                                                             \
-    const int i##_last      = dir ? i##_end_floor : begin;                                                             \
-    i                       = i##_first;                                                                               \
-    i_prev                  = i;                                                                                       \
-    for (; dir ? i <= i##_last : i >= i##_last; i = dir ? i + stride : i - stride)
-
-#define L1_M 8 //128;
-#define L1_N 8 //128;
-#define L1_K 8 //128;
+#define FOR_EACH(i, begin, end, stride, dir, i_prev)                     \
+    dir = !dir;                                                          \
+    const int i##_end_floor =                                            \
+        ((end - begin + stride - 1) / stride) * stride - stride + begin; \
+    const int i##_first = dir ? begin : i##_end_floor;                   \
+    const int i##_last = dir ? i##_end_floor : begin;                    \
+    i = i##_first;                                                       \
+    i_prev = i;                                                          \
+    for (; dir ? i <= i##_last : i >= i##_last;                          \
+         i = dir ? i + stride : i - stride)
+
+#define L1_M 8  // 128;
+#define L1_N 8  // 128;
+#define L1_K 8  // 128;
 #define L1_LDA L1_K
 #define L1_LDB L1_N
 #define L1_LDC L1_N
@@ -59,28 +62,25 @@ NAMED_DUMP(TcdmLayout*, l1, 0x8)
 
 TcdmLayout* l1AddrGlobal[SNRT_CLUSTER_NUM] = {0};
 
-
-
 /**
  * \brief Each cluster performs a GEMM for A, B, C inside each TCDM
  */
-void gemm_cluster_kernel(double alpha, double beta,
-                         uint32_t M, uint32_t N, uint32_t K,
-                         double* const A, double* const B, double* const C,
-                         int lda, int ldb, int ldc) {
+void gemm_cluster_kernel(double alpha, double beta, uint32_t M, uint32_t N,
+                         uint32_t K, double* const A, double* const B,
+                         double* const C, int lda, int ldb, int ldc) {
     uint32_t p[3], P[3];
     ocrt_thread_idx(p);
     ocrt_compute_thread_num(P);
 
     for (uint32_t i = p[0]; i < M; i += P[0]) {
         for (uint32_t j = 0; j < N; j++) {
-            uint32_t cIdx = i * ldc + j; // C[i][j]
+            uint32_t cIdx = i * ldc + j;  // C[i][j]
             // dump_cIdx(cIdx);
             // dump_c(C[cIdx]);
             register double c0 = beta * C[cIdx];
             for (uint32_t k = 0; k < K; k++) {
-                uint32_t aIdx = i * lda + k; // A[i][k]
-                uint32_t bIdx = k * ldb + j; // B[k][j]
+                uint32_t aIdx = i * lda + k;  // A[i][k]
+                uint32_t bIdx = k * ldb + j;  // B[k][j]
                 // dump_aIdx(aIdx);
                 // dump_bIdx(bIdx);
                 // dump_a(A[aIdx]);
@@ -94,14 +94,14 @@ void gemm_cluster_kernel(double alpha, double beta,
     snrt_fpu_fence();
 }
 
-void gemm_oc_opt2d(double alpha, double beta,
-                   uint32_t m, uint32_t n, uint32_t k,
-                   double* A, double* B, double* C,
-                   uint32_t lda, uint32_t ldb, uint32_t ldc) {
+void gemm_oc_opt2d(double alpha, double beta, uint32_t m, uint32_t n,
+                   uint32_t k, double* A, double* B, double* C, uint32_t lda,
+                   uint32_t ldb, uint32_t ldc) {
     /**
-    * Problem is double buffered in L1. The buffer that is used is toggled at each iteration.
-    * The DMA cores are one index step ahead so they load the data in advance into the buffer that will be used.
-    */
+     * Problem is double buffered in L1. The buffer that is used is toggled at
+     * each iteration. The DMA cores are one index step ahead so they load the
+     * data in advance into the buffer that will be used.
+     */
 
     volatile uint32_t p[3] = {0, 0, 0};
     volatile uint32_t P[3] = {0, 0, 0};
@@ -110,19 +110,18 @@ void gemm_oc_opt2d(double alpha, double beta,
 
     // Setup layout for TCDM L1
     // For double buffering l1 is a size 2 array
-    TcdmLayout* l1                       = (TcdmLayout*) snrt_l1_next();
+    TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next();
     TcdmLayout* l1Addr[SNRT_CLUSTER_NUM] = {0};
 
     // Sync l1 pointers between clusters
-    if (snrt_is_dm_core())
-        l1AddrGlobal[p[1]] = l1;
+    if (snrt_is_dm_core()) l1AddrGlobal[p[1]] = l1;
     snrt_global_barrier();
     if (snrt_is_dm_core()) {
         memcpy(l1Addr, l1AddrGlobal, SNRT_CLUSTER_NUM * sizeof(*l1Addr));
     }
 
     bool l1Id_AB = false;
-    bool l1Id_C  = false;
+    bool l1Id_C = false;
 
     // Initialize indices
     const uint32_t I = m, J = n, K = k;
@@ -141,12 +140,13 @@ void gemm_oc_opt2d(double alpha, double beta,
     volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;
 
     if (snrt_is_compute_core()) {
-        snrt_global_barrier(); // DMA core is one index ahead
+        snrt_global_barrier();  // DMA core is one index ahead
     }
 
     // Compute C2C sources for 2D pipeline
-    volatile const uint32_t pk = (PI + 2 * PJ - pi - pj - 1) % PJ; // pipeline step
-    int PK                     = PJ; // pipeline depth
+    volatile const uint32_t pk =
+        (PI + 2 * PJ - pi - pj - 1) % PJ;  // pipeline step
+    int PK = PJ;                           // pipeline depth
 
     // Determine C2C source cluster index for each matrix, < 0 is from DRAM
     TcdmLayout* c2cL1_A = NULL;
@@ -155,7 +155,7 @@ void gemm_oc_opt2d(double alpha, double beta,
         dump_pk(pk);
 
         const bool fetch_dram = pk == 0;
-        
+
         volatile const uint32_t p_srcA = pi * PJ + ((2 * PJ - pi - pk) % PJ);
         volatile const uint32_t p_srcB = pj + PJ * ((2 * PJ - pj - pk) % PJ);
         dump_p_src(fetch_dram ? -1 : p_srcA);
@@ -181,8 +181,7 @@ void gemm_oc_opt2d(double alpha, double beta,
                 dump_ib(ib);
                 dump_jb(jb);
                 snrt_dma_load_2d_tile(l1_C, C, ib, jb, L1_M, L1_N, ldc, FP64);
-                if (ib != ib_first || jb != jb_first)
-                    storeC = true;
+                if (ib != ib_first || jb != jb_first) storeC = true;
             }
 
             FOR_EACH(kb, 0, K / L1_K, 1, kb_dir, kb_prev) {
@@ -193,13 +192,15 @@ void gemm_oc_opt2d(double alpha, double beta,
                 // load next A, B
                 if (snrt_is_dm_core()) {
                     if (c2cL1_A == NULL)
-                        snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, FP64);
+                        snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda,
+                                              FP64);
                     else {
                         double* const c2c_A = c2cL1_A[l1Id_AB].A;
                         snrt_dma_start_1d(l1_A, c2c_A, L1_M * L1_K * FP64);
                     }
                     if (c2cL1_B == NULL)
-                        snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, FP64);
+                        snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb,
+                                              FP64);
                     else {
                         double* const c2c_B = c2cL1_B[l1Id_AB].B;
                         snrt_dma_start_1d(l1_B, c2c_B, L1_K * L1_N * FP64);
@@ -207,37 +208,41 @@ void gemm_oc_opt2d(double alpha, double beta,
 
                     snrt_dma_wait_all();
                 } else {
-                    // solve block already in l1, parallelize inside each cluster
-                    gemm_cluster_kernel(alpha, beta, L1_M, L1_N, L1_K, l1_A, l1_B, l1_C, L1_LDA, L1_LDB, L1_LDC);
+                    // solve block already in l1, parallelize inside each
+                    // cluster
+                    gemm_cluster_kernel(alpha, beta, L1_M, L1_N, L1_K, l1_A,
+                                        l1_B, l1_C, L1_LDA, L1_LDB, L1_LDC);
 
                     // gemm(FP64, 0, true, false, false,
                     //      L1_M, L1_N, L1_K, alpha,
                     //      l1_A, L1_LDA, l1_B, L1_LDB, beta, l1_C, L1_LDC);
                 }
 
-                l1Id_AB = !l1Id_AB; // switch buffers
+                l1Id_AB = !l1Id_AB;  // switch buffers
                 snrt_global_barrier();
 
                 if (snrt_is_dm_core()) {
                     if (storeC) {
                         storeC = false;
-                        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64);
+                        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev,
+                                               jb_prev, L1_M, L1_N, ldc, FP64);
                     }
                 }
                 kb_prev = kb;
             }
 
-            l1Id_C  = !l1Id_C; // switch buffers
+            l1Id_C = !l1Id_C;  // switch buffers
             jb_prev = jb;
             ib_prev = ib;
         }
     }
 
     if (snrt_is_dm_core()) {
-        snrt_global_barrier(); // DMA core is one index ahead
+        snrt_global_barrier();  // DMA core is one index ahead
 
         // store final tile
-        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64);
+        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N,
+                               ldc, FP64);
         snrt_dma_wait_all();
     }
 
diff --git a/sw/blas/gemm/src/gemm_occamy_baseline.h b/sw/blas/gemm/src/gemm_occamy_baseline.h
index cd7c3eefb..67d66ce82 100644
--- a/sw/blas/gemm/src/gemm_occamy_baseline.h
+++ b/sw/blas/gemm/src/gemm_occamy_baseline.h
@@ -2,12 +2,12 @@
 
 #pragma once
 
-#include <stdint.h>
 #include <stdbool.h>
+#include <stdint.h>
 #include <string.h>
 
-#include "snrt.h"
 #include "gemm.h"
+#include "snrt.h"
 
 #include "dump.h"
 NAMED_DUMP(uint32_t, aIdx, 0x1a)
@@ -25,21 +25,24 @@ NAMED_DUMP(double, c, 0xc)
  * \param begin Beginning of the range
  * \param end End of the range
  * \param dir Sets the direction of traversal. True: loop starts at begin.
- * \param i_prev Set the previous index to the first index, must update this manually at the end of the loop.
- * \details i_end_floor will contain the exact end with the stride, s.t. the reversed loop starts at the correct index.
+ * \param i_prev Set to the first index; must be updated manually at loop end.
+ * \details i_end_floor will contain the exact end with the stride, s.t. the
+ * reversed loop starts at the correct index.
  */
-#define FOR_EACH(i, begin, end, stride, dir, i_prev)                                                                   \
-    dir = !dir;                                                                                                        \
-    const int i##_end_floor = ((end - begin + stride - 1) / stride) * stride - stride + begin;                         \
-    const int i##_first     = dir ? begin : i##_end_floor;                                                             \
-    const int i##_last      = dir ? i##_end_floor : begin;                                                             \
-    i                       = i##_first;                                                                               \
-    i_prev                  = i;                                                                                       \
-    for (; dir ? i <= i##_last : i >= i##_last; i = dir ? i + stride : i - stride)
-
-#define L1_M 8 //128;
-#define L1_N 8 //128;
-#define L1_K 8 //128;
+#define FOR_EACH(i, begin, end, stride, dir, i_prev)                     \
+    dir = !dir;                                                          \
+    const int i##_end_floor =                                            \
+        ((end - begin + stride - 1) / stride) * stride - stride + begin; \
+    const int i##_first = dir ? begin : i##_end_floor;                   \
+    const int i##_last = dir ? i##_end_floor : begin;                    \
+    i = i##_first;                                                       \
+    i_prev = i;                                                          \
+    for (; dir ? i <= i##_last : i >= i##_last;                          \
+         i = dir ? i + stride : i - stride)
+
+#define L1_M 8  // 128;
+#define L1_N 8  // 128;
+#define L1_K 8  // 128;
 #define L1_LDA L1_K
 #define L1_LDB L1_N
 #define L1_LDC L1_N
@@ -58,23 +61,22 @@ NAMED_DUMP(TcdmLayout*, l1, 0x8)
 /**
  * \brief Each cluster performs a GEMM for A, B, C inside each TCDM
  */
-void gemm_cluster_kernel(double alpha, double beta,
-                         uint32_t M, uint32_t N, uint32_t K,
-                         double* const A, double* const B, double* const C,
-                         int lda, int ldb, int ldc) {
+void gemm_cluster_kernel(double alpha, double beta, uint32_t M, uint32_t N,
+                         uint32_t K, double* const A, double* const B,
+                         double* const C, int lda, int ldb, int ldc) {
     uint32_t p[3], P[3];
     ocrt_thread_idx(p);
     ocrt_compute_thread_num(P);
 
     for (uint32_t i = p[0]; i < M; i += P[0]) {
         for (uint32_t j = 0; j < N; j++) {
-            uint32_t cIdx = i * ldc + j; // C[i][j]
+            uint32_t cIdx = i * ldc + j;  // C[i][j]
             // dump_cIdx(cIdx);
             // dump_c(C[cIdx]);
             register double c0 = beta * C[cIdx];
             for (uint32_t k = 0; k < K; k++) {
-                uint32_t aIdx = i * lda + k; // A[i][k]
-                uint32_t bIdx = k * ldb + j; // B[k][j]
+                uint32_t aIdx = i * lda + k;  // A[i][k]
+                uint32_t bIdx = k * ldb + j;  // B[k][j]
                 // dump_aIdx(aIdx);
                 // dump_bIdx(bIdx);
                 // dump_a(A[aIdx]);
@@ -88,14 +90,14 @@ void gemm_cluster_kernel(double alpha, double beta,
     snrt_fpu_fence();
 }
 
-void gemm_oc_baseline(double alpha, double beta,
-                      uint32_t m, uint32_t n, uint32_t k,
-                      double* A, double* B, double* C,
-                      uint32_t lda, uint32_t ldb, uint32_t ldc) {
+void gemm_oc_baseline(double alpha, double beta, uint32_t m, uint32_t n,
+                      uint32_t k, double* A, double* B, double* C, uint32_t lda,
+                      uint32_t ldb, uint32_t ldc) {
     /**
-    * Problem is double buffered in L1. The buffer that is used is toggled at each iteration.
-    * The DMA cores are one index step ahead so they load the data in advance into the buffer that will be used.
-    */
+     * Problem is double buffered in L1. The buffer that is used is toggled at
+     * each iteration. The DMA cores are one index step ahead so they load the
+     * data in advance into the buffer that will be used.
+     */
 
     volatile uint32_t p[3] = {0, 0, 0};
     volatile uint32_t P[3] = {0, 0, 0};
@@ -104,10 +106,10 @@ void gemm_oc_baseline(double alpha, double beta,
 
     // Setup layout for TCDM L1
     // For double buffering l1 is a size 2 array
-    TcdmLayout* l1 = (TcdmLayout*) snrt_l1_next();
+    TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next();
 
     bool l1Id_AB = false;
-    bool l1Id_C  = false;
+    bool l1Id_C = false;
 
     // Initialize indices
     const uint32_t I = m, J = n, K = k;
@@ -126,26 +128,28 @@ void gemm_oc_baseline(double alpha, double beta,
     volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;
 
     if (snrt_is_compute_core()) {
-        snrt_cluster_hw_barrier(); // DMA core is one index ahead
+        snrt_cluster_hw_barrier();  // DMA core is one index ahead
     }
 
     // FOR_EACH(ib, pi, I / L1_M, PI, ib_dir, ib_prev) {
-    ib_dir                 = !ib_dir;
+    ib_dir = !ib_dir;
     const int ib_end_floor = ((I / 8 - pi + PI - 1) / PI) * PI - PI + pi;
-    const int ib_first     = ib_dir ? pi : ib_end_floor;
-    const int ib_last      = ib_dir ? ib_end_floor : pi;
-    ib                     = ib_first;
-    ib_prev                = ib;
-    for (; ib_dir ? ib <= ib_last : ib >= ib_last; ib = ib_dir ? ib + PI : ib - PI) {
+    const int ib_first = ib_dir ? pi : ib_end_floor;
+    const int ib_last = ib_dir ? ib_end_floor : pi;
+    ib = ib_first;
+    ib_prev = ib;
+    for (; ib_dir ? ib <= ib_last : ib >= ib_last;
+         ib = ib_dir ? ib + PI : ib - PI) {
         ib_cnt += ib;
         // FOR_EACH(jb, pj, J / L1_N, PJ, jb_dir, jb_prev) {
-        jb_dir                 = !jb_dir;
+        jb_dir = !jb_dir;
         const int jb_end_floor = ((J / 8 - pj + PJ - 1) / PJ) * PJ - PJ + pj;
-        const int jb_first     = jb_dir ? pj : jb_end_floor;
-        const int jb_last      = jb_dir ? jb_end_floor : pj;
-        jb                     = jb_first;
-        jb_prev                = jb;
-        for (; jb_dir ? jb <= jb_last : jb >= jb_last; jb = jb_dir ? jb + PJ : jb - PJ) {
+        const int jb_first = jb_dir ? pj : jb_end_floor;
+        const int jb_last = jb_dir ? jb_end_floor : pj;
+        jb = jb_first;
+        jb_prev = jb;
+        for (; jb_dir ? jb <= jb_last : jb >= jb_last;
+             jb = jb_dir ? jb + PJ : jb - PJ) {
             jb_cnt += jb;
 
             double* const l1_C = l1[l1Id_C].C;
@@ -154,60 +158,64 @@ void gemm_oc_baseline(double alpha, double beta,
                 dump_ib(ib);
                 dump_jb(jb);
                 snrt_dma_load_2d_tile(l1_C, C, ib, jb, L1_M, L1_N, ldc, FP64);
-                if (ib != ib_first || jb != jb_first)
-                    storeC = true;
+                if (ib != ib_first || jb != jb_first) storeC = true;
             }
 
             // FOR_EACH(kb, 0, K / L1_K, 1, kb_dir, kb_prev) {
-            kb_dir                 = !kb_dir;
+            kb_dir = !kb_dir;
             const int kb_end_floor = ((K / L1_K - 0 + 1 - 1) / 1) * 1 - 1 + 0;
-            const int kb_first     = kb_dir ? 0 : kb_end_floor;
-            const int kb_last      = kb_dir ? kb_end_floor : 0;
-            kb                     = kb_first;
-            kb_prev                = kb;
-            for (; kb_dir ? kb <= kb_last : kb >= kb_last; kb = kb_dir ? kb + 1 : kb - 1) {
+            const int kb_first = kb_dir ? 0 : kb_end_floor;
+            const int kb_last = kb_dir ? kb_end_floor : 0;
+            kb = kb_first;
+            kb_prev = kb;
+            for (; kb_dir ? kb <= kb_last : kb >= kb_last;
+                 kb = kb_dir ? kb + 1 : kb - 1) {
                 kb_cnt += kb;
                 double* const l1_A = l1[l1Id_AB].A;
                 double* const l1_B = l1[l1Id_AB].B;
 
                 // load next A, B
                 if (snrt_is_dm_core()) {
-                    snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, FP64);
-                    snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, FP64);
+                    snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda,
+                                          FP64);
+                    snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb,
+                                          FP64);
 
                     snrt_dma_wait_all();
                 } else {
-                    // solve block already in l1, parallelize inside each cluster
-                    // gemm_cluster_kernel(alpha, beta, L1_M, L1_N, L1_K, l1_A, l1_B, l1_C, L1_LDA, L1_LDB, L1_LDC);
+                    // solve block already in l1, parallelize in each cluster
+                    // gemm_cluster_kernel(alpha, beta, L1_M, L1_N, L1_K, l1_A,
+                    //     l1_B, l1_C, L1_LDA, L1_LDB, L1_LDC);
 
-                    gemm(FP64, 0, true, false, false,
-                         L1_M, L1_N, L1_K, alpha,
+                    gemm(FP64, 0, true, false, false, L1_M, L1_N, L1_K, alpha,
                          l1_A, L1_LDA, l1_B, L1_LDB, beta, l1_C, L1_LDC);
                 }
 
-                l1Id_AB = !l1Id_AB; // switch buffers
+                l1Id_AB = !l1Id_AB;  // switch buffers
                 snrt_cluster_hw_barrier();
 
                 if (snrt_is_dm_core()) {
                     if (storeC) {
                         storeC = false;
-                        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64);
+                        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev,
+                                               jb_prev, L1_M, L1_N, ldc, FP64);
                     }
                 }
                 kb_prev = kb;
             }
 
-            l1Id_C  = !l1Id_C; // switch buffers
+            l1Id_C = !l1Id_C;  // switch buffers
             jb_prev = jb;
             ib_prev = ib;
         }
     }
 
     if (snrt_is_dm_core()) {
-        snrt_cluster_hw_barrier(); // DMA core is one index ahead
+        snrt_cluster_hw_barrier();  // DMA core is one index ahead
 
         // store final tile
-        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64);
+        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N,
+                               ldc, FP64);
         snrt_dma_wait_all();
     }
 
diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c
index e5b63b351..183986970 100644
--- a/sw/blas/gemm/src/main.c
+++ b/sw/blas/gemm/src/main.c
@@ -6,8 +6,8 @@
 //         Luca Colagrande <colluca@iis.ee.ethz.ch>
 
 #include <math.h>
-#include <stdint.h>
 #include <stdbool.h>
+#include <stdint.h>
 
 #include "snrt.h"
 
@@ -27,8 +27,8 @@ int main() {
     uint32_t ldb = N;
     uint32_t ldc = N;
 
-    gemm_oc(dtype_size, expand, setup_ssr, TA, TB, M, N, K, 1,
-            a, lda, b, ldb, BETA, c, ldc);
+    gemm_oc(dtype_size, expand, setup_ssr, TA, TB, M, N, K, 1, a, lda, b, ldb,
+            BETA, c, ldc);
 
     uint32_t end_cycle = snrt_mcycle();
 
@@ -42,8 +42,7 @@ int main() {
         for (uint32_t m = 0; m < M; m++) {
             for (uint32_t n = 0; n < N; n++) {
                 uint32_t idx = m * N + n;
-                if (fabs(result[idx] - c[idx]) < 0.001)
-                    errors--;
+                if (fabs(result[idx] - c[idx]) < 0.001) errors--;
             }
         }
         // printf("%d/%d Errors\n", errors, M * N);
@@ -135,7 +134,8 @@ int main() {
 //                             errors--;
 //                         break;
 //                     case FP32:
-//                         if (fabs(result[idx] - ((float *)local_c)[idx]) > 0.001)
+//                         if (fabs(result[idx] - ((float *)local_c)[idx]) >
+//                             0.001)
 //                             errors--;
 //                         break;
 //                     case FP16: