diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py
index d19e372a1..08b924dca 100755
--- a/sw/blas/gemm/data/datagen.py
+++ b/sw/blas/gemm/data/datagen.py
@@ -122,6 +122,7 @@ def emit_header(**kwargs):
     # gemmImpl
     data_str += ["// -- gemmImpl"]
     data_str += [f"#define USE_METHOD {gemmImpl['method']}"]
+    data_str += [f"#define USE_C2C_TILES {int(gemmImpl['use_c2c_tiles'])}"]
     data_str += [f"#define L1_M {gemmImpl['L1_M']}"]
     data_str += [f"#define L1_N {gemmImpl['L1_N']}"]
    data_str += [f"#define L1_K {gemmImpl['L1_K']}"]
diff --git a/sw/blas/gemm/data/params.hjson b/sw/blas/gemm/data/params.hjson
index 46b725ded..e4c33fe2c 100644
--- a/sw/blas/gemm/data/params.hjson
+++ b/sw/blas/gemm/data/params.hjson
@@ -21,11 +21,12 @@
     }
 
     gemmImpl: {
-        method: "baseline",
-        L1_M: 8,
-        L1_N: 8,
-        L1_K: 8,
-        ta_tile: false,
+        method: "2dpipe",
+        use_c2c_tiles: true,
+        L1_M: 16,
+        L1_N: 16,
+        L1_K: 16,
+        ta_tile: true,
         tb_tile: false,
         tc_tile: false, // not implemented
         expand: 0,
diff --git a/sw/blas/gemm/src/dma_xfer_test.h b/sw/blas/gemm/src/dma_xfer_test.h
index e55e0426f..acb7d6f03 100644
--- a/sw/blas/gemm/src/dma_xfer_test.h
+++ b/sw/blas/gemm/src/dma_xfer_test.h
@@ -1,5 +1,11 @@
 #include "gemm_decls.h"
 
+/**
+ * Tests effective DMA bandwidth.
+ * Transfers an array from HBM to TCDM,
+ * rotates it between cluster TCDMs using C2C communication,
+ * then stores the result back to HBM.
+ */
 void dma_xfer_test(const double* A, const uint32_t N, const bool bench) {
     if (!snrt_is_dm_core()) return;
 
diff --git a/sw/blas/gemm/src/gemm_baseline.h b/sw/blas/gemm/src/gemm_baseline.h
deleted file mode 100644
index ae3e297e7..000000000
--- a/sw/blas/gemm/src/gemm_baseline.h
+++ /dev/null
@@ -1,183 +0,0 @@
-// GEMM implementation for OCCAMY without any horizontal (C2C/G2G) communication
-
-#pragma once
-
-#include <stdbool.h>
-#include <stdint.h>
-#include <stddef.h>
-
-#include "gemm.h"
-#include "snrt.h"
-
-#include "gemm_decls.h"
-
-/**
- * \brief Each cluster performs a GEMM for A, B, C inside each TCDM
- */
-void gemm_cluster_kernel(double alpha, double beta, uint32_t M, uint32_t N,
-                         uint32_t K, double* const A, double* const B,
-                         double* const C, int lda, int ldb, int ldc) {
-    uint32_t p[3], P[3];
-    ocrt_thread_idx(p);
-    ocrt_compute_thread_num(P);
-
-    for (uint32_t i = p[0]; i < M; i += P[0]) {
-        for (uint32_t j = 0; j < N; j++) {
-            uint32_t cIdx = i * ldc + j;  // C[i][j]
-            register double c0 = beta * C[cIdx];
-
-            for (uint32_t k = 0; k < K; k++) {
-                uint32_t aIdx = i * lda + k;  // A[i][k]
-                uint32_t bIdx = k * ldb + j;  // B[k][j]
-
-                c0 += A[aIdx] * B[bIdx];
-            }
-            C[cIdx] = c0;
-        }
-    }
-    snrt_fpu_fence();
-}
-
-void gemm_oc_baseline(double alpha, double beta, uint32_t m, uint32_t n,
-                      uint32_t k, double* A, double* B, double* C, uint32_t lda,
-                      uint32_t ldb, uint32_t ldc) {
-    /**
-     * Problem is double buffered in L1. The buffer that is used is toggled at
-     * each iteration. The DMA cores are one index step ahead so they load the
-     * data in advance into the buffer that will be used.
-     */
-
-    volatile uint32_t p[3] = {0, 0, 0};
-    volatile uint32_t P[3] = {0, 0, 0};
-    ocrt_thread_idx(p);
-    ocrt_compute_thread_num(P);
-
-    // Setup layout for TCDM L1
-    // For double buffering l1 is a size 2 array
-    TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next();
-
-    bool l1Id_AB = false;
-    bool l1Id_C = false;
-
-    // Initialize indices
-    const uint32_t I = m, J = n, K = k;
-
-    const uint32_t PI = P[1], PJ = 1;
-    const uint32_t pi = p[1] / PJ;
-    const uint32_t pj = p[1] % PJ;
-
-    int ib, jb, kb;
-    int ib_prev, jb_prev, kb_prev;
-    bool ib_dir = false, jb_dir = false, kb_dir = false;
-
-    bool storeC = false;
-
-    // Debug
-    volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;
-
-    if (snrt_is_compute_core()) {
-        snrt_cluster_hw_barrier();  // DMA core is one index ahead
-    }
-
-    // FOR_EACH(ib, pi, I / L1_M, PI, ib_dir, ib_prev) {
-    ib_dir = !ib_dir;
-    const int ib_end_floor = ((I / 8 - pi + PI - 1) / PI) * PI - PI + pi;
-    const int ib_first = ib_dir ? pi : ib_end_floor;
-    const int ib_last = ib_dir ? ib_end_floor : pi;
-    ib = ib_first;
-    ib_prev = ib;
-    for (; ib_dir ? ib <= ib_last : ib >= ib_last;
-         ib = ib_dir ? ib + PI : ib - PI) {
-        ib_cnt += ib;
-        // FOR_EACH(jb, pj, J / L1_N, PJ, jb_dir, jb_prev) {
-        jb_dir = !jb_dir;
-        const int jb_end_floor = ((J / 8 - pj + PJ - 1) / PJ) * PJ - PJ + pj;
-        const int jb_first = jb_dir ? pj : jb_end_floor;
-        const int jb_last = jb_dir ? jb_end_floor : pj;
-        jb = jb_first;
-        jb_prev = jb;
-        for (; jb_dir ? jb <= jb_last : jb >= jb_last;
-             jb = jb_dir ? jb + PJ : jb - PJ) {
-            jb_cnt += jb;
-
-            double* const l1_C = l1[l1Id_C].C;
-
-            if (snrt_is_dm_core()) {
-                dump_ib(ib);
-                dump_jb(jb);
-                snrt_dma_load_2d_tile(l1_C, C, ib, jb, L1_M, L1_N, ldc, FP64);
-                if (ib != ib_first || jb != jb_first) storeC = true;
-            }
-
-            // FOR_EACH(kb, 0, K / L1_K, 1, kb_dir, kb_prev) {
-            kb_dir = !kb_dir;
-            const int kb_end_floor = ((K / L1_K - 0 + 1 - 1) / 1) * 1 - 1 + 0;
-            const int kb_first = kb_dir ? 0 : kb_end_floor;
-            const int kb_last = kb_dir ? kb_end_floor : 0;
-            kb = kb_first;
-            kb_prev = kb;
-            for (; kb_dir ? kb <= kb_last : kb >= kb_last;
-                 kb = kb_dir ? kb + 1 : kb - 1) {
-                kb_cnt += kb;
-                double* const l1_A = l1[l1Id_AB].A;
-                double* const l1_B = l1[l1Id_AB].B;
-
-                // load next A, B
-                if (snrt_is_dm_core()) {
-                    snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda,
-                                          FP64);
-                    snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb,
-                                          FP64);
-
-                    snrt_dma_wait_all();
-                } else {
-                    // solve block already in l1, parallelize inside each
-                    // cluster gemm_cluster_kernel(alpha, beta, L1_M, L1_N,
-                    // L1_K, l1_A, l1_B, l1_C, L1_LDA, L1_LDB, L1_LDC);
-
-                    gemm(FP64, 0, true, false, false, L1_M, L1_N, L1_K, alpha,
-                         l1_A, L1_LDA, l1_B, L1_LDB, beta, l1_C, L1_LDC);
-                }
-
-                l1Id_AB = !l1Id_AB;  // switch buffers
-                snrt_cluster_hw_barrier();
-
-                if (snrt_is_dm_core()) {
-                    if (storeC) {
-                        storeC = false;
-                        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev,
-                                               jb_prev, L1_M, L1_N, ldc, FP64);
-                    }
-                }
-                kb_prev = kb;
-            }
-
-            l1Id_C = !l1Id_C;  // switch buffers
-            jb_prev = jb;
-            ib_prev = ib;
-        }
-    }
-
-    if (snrt_is_dm_core()) {
-        snrt_cluster_hw_barrier();  // DMA core is one index ahead
-
-        // store final tile
-        snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N,
-                               ldc, FP64);
-        snrt_dma_wait_all();
-    }
-
-    // Free memory once implemented by snrt
-    // snrt_l1free(l1);
-}
-
-inline void gemm_oc(precision_t prec, uint32_t expand, uint32_t setup_ssr,
-                    uint32_t transa, uint32_t transb, uint32_t m, uint32_t n,
-                    uint32_t k, double alpha, void* a, uint32_t lda, void* b,
-                    uint32_t ldb, uint32_t beta, void* c, uint32_t ldc) {
-    // gemm_cluster_kernel(alpha, beta, m, n, k, a, b, c, lda, ldb, ldc);
-    // snrt_fpu_fence();
-    // snrt_cluster_hw_barrier();
-
-    gemm_oc_baseline(alpha, beta, m, n, k, a, b, c, lda, ldb, ldc);
-}
diff --git a/sw/blas/gemm/src/gemm_kernel_legacy.h b/sw/blas/gemm/src/gemm_kernel_legacy.h
index 9cfd42053..1b74854b5 100644
--- a/sw/blas/gemm/src/gemm_kernel_legacy.h
+++ b/sw/blas/gemm/src/gemm_kernel_legacy.h
@@ -23,6 +23,33 @@ typedef __fp16 v4f16 __attribute__((vector_size(8)));
 typedef char v8f8 __attribute__((vector_size(8)));
 #endif
 
+/**
+ * \brief Each cluster performs a GEMM for A, B, C inside each TCDM
+ */
+void gemm_cluster_kernel_baseline(double alpha, double beta, uint32_t M, uint32_t N,
+                                  uint32_t K, double* const A, double* const B,
+                                  double* const C, int lda, int ldb, int ldc) {
+    uint32_t p[3], P[3];
+    ocrt_thread_idx(p);
+    ocrt_compute_thread_num(P);
+
+    for (uint32_t i = p[0]; i < M; i += P[0]) {
+        for (uint32_t j = 0; j < N; j++) {
+            uint32_t cIdx = i * ldc + j;  // C[i][j]
+            register double c0 = beta * C[cIdx];
+
+            for (uint32_t k = 0; k < K; k++) {
+                uint32_t aIdx = i * lda + k;  // A[i][k]
+                uint32_t bIdx = k * ldb + j;  // B[k][j]
+
+                c0 += A[aIdx] * B[bIdx];
+            }
+            C[cIdx] = c0;
+        }
+    }
+    snrt_fpu_fence();
+}
+
 void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A,
                         uint32_t ldA, uint32_t ta, double* B, uint32_t ldB,
                         uint32_t tb, double* C, uint32_t ldC, double ALPHA) {
diff --git a/sw/blas/gemm/src/gemm_tiling_2dpipe_tpl.h b/sw/blas/gemm/src/gemm_tiling_2dpipe_tpl.h
index 6b4540f20..a84a11a13 100644
--- a/sw/blas/gemm/src/gemm_tiling_2dpipe_tpl.h
+++ b/sw/blas/gemm/src/gemm_tiling_2dpipe_tpl.h
@@ -10,8 +10,6 @@
 void SNBLAS_GEMM_TILING(2dpipe, FLOAT_T, IS_DM_CORE)
 (const SnblasGemmInfo info, const SNBLAS_GEMM_ARGS(FLOAT_T) args,
  const SnblasGemmImpl impl) {
-#define USE_C2C_TILES true
-
     /**
     * Problem is double buffered in L1. The buffer that is used is toggled at
     * each iteration. The DMA cores are one index step ahead so they load the
diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c
index 018726a86..93f6584ba 100644
--- a/sw/blas/gemm/src/main.c
+++ b/sw/blas/gemm/src/main.c
@@ -21,8 +21,6 @@
 NAMED_DUMP(uint32_t, err, 0x7)
 NAMED_DUMP(uint32_t, bench_iter, 0x7)
 
-#define BIST
-#include "data.h"
 
 int main() {
     const bool setup_ssr = true;
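
Note on the double-buffering scheme that both the removed gemm_oc_baseline and the 2dpipe tiling describe in their comments: compute cores work on one L1 buffer while the DMA core, running one index step ahead, fills the other, and the two sides swap buffers every iteration. The stand-alone C sketch below mocks that ping-pong on a host machine. Tile, fetch_tile, and compute_tile are invented stand-ins for illustration (plain memcpy takes the place of snrt_dma_load_2d_tile and the snrt_cluster_hw_barrier synchronization), so it shows only the index and toggle logic, not Occamy code.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define N_TILES 8
#define TILE_ELEMS 64

typedef struct { double data[TILE_ELEMS]; } Tile;

static Tile hbm[N_TILES]; /* stand-in for tiles in main memory */
static Tile l1[2];        /* the two "TCDM" tile slots */

/* Stand-in for snrt_dma_load_2d_tile(): copy tile t into L1 slot buf. */
static void fetch_tile(bool buf, int t) {
    memcpy(&l1[buf], &hbm[t], sizeof(Tile));
}

/* Stand-in for the cluster GEMM kernel: consume the tile in slot buf. */
static double compute_tile(bool buf) {
    double acc = 0.0;
    for (int i = 0; i < TILE_ELEMS; i++) acc += l1[buf].data[i];
    return acc;
}

int main(void) {
    for (int t = 0; t < N_TILES; t++)
        for (int i = 0; i < TILE_ELEMS; i++) hbm[t].data[i] = (double)t;

    bool l1Id = false;   /* buffer the compute side uses this step */
    fetch_tile(l1Id, 0); /* "DMA" starts one index ahead: preload tile 0 */

    double total = 0.0;
    for (int t = 0; t < N_TILES; t++) {
        /* Prefetch the next tile into the *other* buffer while the
         * current one is consumed; on Occamy these two halves run on
         * different cores, separated by snrt_cluster_hw_barrier(). */
        if (t + 1 < N_TILES) fetch_tile(!l1Id, t + 1);
        total += compute_tile(l1Id);
        l1Id = !l1Id; /* toggle buffers, as in l1Id_AB = !l1Id_AB */
    }

    printf("sum over all tiles: %f\n", total); /* 64 * (0+1+...+7) = 1792 */
    return 0;
}

Run sequentially this computes the same result either way; on the real hardware the buffer toggle plus the one-step-ahead ordering is what lets the DMA transfer for iteration t+1 overlap the FPU work of iteration t.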