From c33aee3dbce5371d901e7b653c1fa7037e430607 Mon Sep 17 00:00:00 2001
From: Diyou Shen <dishen@iis.ee.ethz.ch>
Date: Tue, 3 Dec 2024 14:26:58 +0100
Subject: [PATCH] [SW] Update main files for better Cache/SPM configurations.

---
 sw/spatzBenchmarks/dp-faxpy-cache/main.c      |  2 +-
 sw/spatzBenchmarks/dp-faxpy/main.c            |  2 +-
 sw/spatzBenchmarks/dp-fdotp/main.c            | 86 ++++++++++++-------
 sw/spatzBenchmarks/dp-fft-cache/main.c        |  2 +-
 sw/spatzBenchmarks/dp-fft/main.c              |  7 +-
 sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c    | 17 +++-
 sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c    | 16 +++-
 sw/spatzBenchmarks/dp-fmatmul/main.c          | 17 +++-
 .../dp-mxfmatmul-m4n4k4-b2/main.c             |  8 +-
 .../dp-mxfmatmul-m8n4k4-b2/main.c             |  8 +-
 .../dp-mxfmatmul-m8n4k4-b4/main.c             | 10 +--
 11 files changed, 124 insertions(+), 51 deletions(-)

diff --git a/sw/spatzBenchmarks/dp-faxpy-cache/main.c b/sw/spatzBenchmarks/dp-faxpy-cache/main.c
index 0f78cbb..af4fbbf 100644
--- a/sw/spatzBenchmarks/dp-faxpy-cache/main.c
+++ b/sw/spatzBenchmarks/dp-faxpy-cache/main.c
@@ -44,7 +44,7 @@ int main() {
 
   if (cid == 0) {
     // Init the cache
-    l1d_init(32);
+    l1d_init(16);
   }
 
   // Wait for all cores to finish
diff --git a/sw/spatzBenchmarks/dp-faxpy/main.c b/sw/spatzBenchmarks/dp-faxpy/main.c
index 59c79f3..af82f3c 100644
--- a/sw/spatzBenchmarks/dp-faxpy/main.c
+++ b/sw/spatzBenchmarks/dp-faxpy/main.c
@@ -44,7 +44,7 @@ int main() {
 
   if (cid == 0) {
     // Init the cache
-    l1d_init(32);
+    l1d_init(120);
   }
 
   // Reset timer
diff --git a/sw/spatzBenchmarks/dp-fdotp/main.c b/sw/spatzBenchmarks/dp-fdotp/main.c
index 85a24ab..e59e1da 100644
--- a/sw/spatzBenchmarks/dp-fdotp/main.c
+++ b/sw/spatzBenchmarks/dp-fdotp/main.c
@@ -23,6 +23,8 @@
 #include DATAHEADER
 #include "kernel/fdotp.c"
 
+#define USE_CACHE
+
 double *a;
 double *b;
 double *result;
@@ -41,9 +43,14 @@ static inline int fp_check(const double a, const double b) {
 int main() {
   const unsigned int num_cores = snrt_cluster_core_num();
   const unsigned int cid = snrt_cluster_core_idx();
+  const int measure_iter = 1;
+
+  #ifdef USE_CACHE
+  uint32_t spm_size = 16;
+  #else
+  uint32_t spm_size = 120;
+  #endif
 
-  uint32_t spm_size = 32;
-  
   if (cid == 0) {
     // Init the cache
     l1d_init(spm_size);
@@ -54,10 +61,20 @@ int main() {
 
   // Reset timer
   unsigned int timer = (unsigned int)-1;
+  unsigned int timer_tmp = 0;
 
   const unsigned int dim = dotp_l.M / num_cores;
 
   // Allocate the matrices
+  #ifdef USE_CACHE
+  if (cid == 0) {
+    result = (double *)snrt_l1alloc(num_cores * sizeof(double));
+  }
+
+  double *a_int = dotp_A_dram + dim * cid;
+  double *b_int = dotp_B_dram + dim * cid;
+
+  #else
   if (cid == 0) {
     a = (double *)snrt_l1alloc(dotp_l.M * sizeof(double));
     b = (double *)snrt_l1alloc(dotp_l.M * sizeof(double));
@@ -78,42 +95,51 @@ int main() {
   double *a_int = a + dim * cid;
   double *b_int = b + dim * cid;
 
+  #endif
+
+
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
 
-  // Start dump
-  if (cid == 0)
-    start_kernel();
+  for (int iter = 0; iter < measure_iter; iter ++) {
+    // Start dump
+    if (cid == 0)
+      start_kernel();
 
-  // Start timer
-  if (cid == 0)
-    timer = benchmark_get_cycle();
+    // Start timer
+    if (cid == 0)
+      timer_tmp = benchmark_get_cycle();
 
-  // Calculate dotp
-  double acc;
-  acc = fdotp_v64b(a_int, b_int, dim);
-  result[cid] = acc;
+    // Calculate dotp
+    double acc;
+    acc = fdotp_v64b(a_int, b_int, dim);
+    result[cid] = acc;
 
-  // Wait for all cores to finish
-  snrt_cluster_hw_barrier();
+    // Wait for all cores to finish
+    snrt_cluster_hw_barrier();
 
-  // Final reduction
-  if (cid == 0) {
-    for (unsigned int i = 1; i < num_cores; ++i)
-      acc += result[i];
-    result[0] = acc;
-  }
+    // Final reduction
+    if (cid == 0) {
+      for (unsigned int i = 1; i < num_cores; ++i)
+        acc += result[i];
+      result[0] = acc;
+    }
 
-  // Wait for all cores to finish
-  snrt_cluster_hw_barrier();
+    // Wait for all cores to finish
+    snrt_cluster_hw_barrier();
 
-  // End dump
-  if (cid == 0)
-    stop_kernel();
+    // End dump
+    if (cid == 0)
+      stop_kernel();
 
-  // End timer and check if new best runtime
-  if (cid == 0)
-    timer = benchmark_get_cycle() - timer;
+    // End timer and check if new best runtime
+    if (cid == 0) {
+      timer_tmp = benchmark_get_cycle() - timer_tmp;
+      timer = (timer < timer_tmp) ? timer : timer_tmp;
+    }
+
+    snrt_cluster_hw_barrier();
+  }
 
   // Check and display results
   if (cid == 0) {
@@ -127,8 +153,8 @@ int main() {
   }
 
   if (cid == 0)
-    if (fp_check(result[0], dotp_result)) {
-      printf("Error: Result = %f, Golden = %f\n", result[0], dotp_result);
+    if (fp_check(result[0], dotp_result*measure_iter)) {
+      printf("Error: Result = %f, Golden = %f\n", result[0], dotp_result*measure_iter);
       return -1;
     }
 
diff --git a/sw/spatzBenchmarks/dp-fft-cache/main.c b/sw/spatzBenchmarks/dp-fft-cache/main.c
index ae3cac7..6b43092 100644
--- a/sw/spatzBenchmarks/dp-fft-cache/main.c
+++ b/sw/spatzBenchmarks/dp-fft-cache/main.c
@@ -42,7 +42,7 @@ int main() {
 
   if (cid == 0) {
     // Init the cache with half-half
-    l1d_init(32);
+    l1d_init(16);
   }
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
diff --git a/sw/spatzBenchmarks/dp-fft/main.c b/sw/spatzBenchmarks/dp-fft/main.c
index 4750812..65415fc 100644
--- a/sw/spatzBenchmarks/dp-fft/main.c
+++ b/sw/spatzBenchmarks/dp-fft/main.c
@@ -47,7 +47,7 @@ int main() {
 
   if (cid == 0) {
     // Init the cache
-    l1d_init(32);
+    l1d_init(120);
   }
 
   // log2(nfft).
@@ -66,6 +66,9 @@ int main() {
     bitrev = (uint16_t *)snrt_l1alloc((NFFT / 4) * sizeof(uint16_t));
   }
 
+  timer = benchmark_get_cycle();
+
+
   // Initialize the matrices
   if (cid == 0) {
     snrt_dma_start_1d(samples, samples_dram, 2 * NFFT * sizeof(double));
@@ -90,7 +93,7 @@ int main() {
   snrt_cluster_hw_barrier();
 
   // Start timer
-  timer = benchmark_get_cycle();
+  // Timer is now started earlier, before the DMA transfers.
 
   // Start dump
   if (cid == 0)
diff --git a/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c b/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c
index 24fe207..2666e41 100644
--- a/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c
+++ b/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c
@@ -23,6 +23,8 @@
 #include DATAHEADER
 #include "kernel/dp-fmatmul.c"
 
+#define USE_CACHE
+
 #ifndef KERNEL_SIZE
 #define KERNEL_SIZE 4
 #endif
@@ -61,8 +63,12 @@ int main() {
   unsigned int m_start, m_end;
   unsigned int p_start, p_end;
   unsigned int kernel_size;
+
+  #ifdef USE_CACHE
+  uint32_t spm_size = 16;
+  #else
   uint32_t spm_size = 120;
-  
+  #endif
   if (cid == 0) {
     // Init the cache
     l1d_init(spm_size);
@@ -70,11 +76,18 @@ int main() {
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
+
   // Allocate the matrices in the local tile
   if (cid == 0) {
+  #ifdef USE_CACHE
+    a = gemm_A_dram;
+    b = gemm_B_dram;
+    c = gemm_C_dram;
+  #else
     a = (double *)snrt_l1alloc(gemm_l.M * gemm_l.K * sizeof(double));
     b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double));
     c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double));
+  #endif
   }
 
   // Reset timer
@@ -93,12 +106,14 @@ int main() {
   snrt_cluster_hw_barrier();
 
   // Initialize matrices
+  #ifndef USE_CACHE
   if (cid == 0) {
     snrt_dma_start_1d(a, gemm_A_dram, gemm_l.M * gemm_l.K * sizeof(double));
     snrt_dma_start_1d(b, gemm_B_dram, gemm_l.K * gemm_l.N * sizeof(double));
     snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double));
     snrt_dma_wait_all();
   }
+  #endif
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
diff --git a/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c b/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c
index 48c2118..5e6b339 100644
--- a/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c
+++ b/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c
@@ -23,6 +23,8 @@
 #include DATAHEADER
 #include "kernel/dp-fmatmul.c"
 
+// #define USE_CACHE
+
 #ifndef KERNEL_SIZE
 #define KERNEL_SIZE 8
 #endif
@@ -53,8 +55,12 @@ int verify_matrix(double *matrix, const double *checksum,
 int main() {
   const unsigned int num_cores = snrt_cluster_core_num();
   const unsigned int cid = snrt_cluster_core_idx();
+
+  #ifdef USE_CACHE
+  uint32_t spm_size = 16;
+  #else
   uint32_t spm_size = 120;
-  
+  #endif
   if (cid == 0) {
     // Init the cache
     l1d_init(spm_size);
@@ -72,9 +78,15 @@ int main() {
 
   // Allocate the matrices in the local tile
   if (cid == 0) {
+  #ifdef USE_CACHE
+    a = gemm_A_dram;
+    b = gemm_B_dram;
+    c = gemm_C_dram;
+  #else
     a = (double *)snrt_l1alloc(gemm_l.M * gemm_l.K * sizeof(double));
     b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double));
     c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double));
+  #endif
   }
 
   // Reset timer
@@ -93,12 +105,14 @@ int main() {
   snrt_cluster_hw_barrier();
 
   // Initialize matrices
+  #ifndef USE_CACHE
   if (cid == 0) {
     snrt_dma_start_1d(a, gemm_A_dram, gemm_l.M * gemm_l.K * sizeof(double));
     snrt_dma_start_1d(b, gemm_B_dram, gemm_l.K * gemm_l.N * sizeof(double));
     snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double));
     snrt_dma_wait_all();
   }
+  #endif
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
diff --git a/sw/spatzBenchmarks/dp-fmatmul/main.c b/sw/spatzBenchmarks/dp-fmatmul/main.c
index 007c18f..9688301 100644
--- a/sw/spatzBenchmarks/dp-fmatmul/main.c
+++ b/sw/spatzBenchmarks/dp-fmatmul/main.c
@@ -23,6 +23,8 @@
 #include DATAHEADER
 #include "kernel/dp-fmatmul.c"
 
+#define USE_CACHE
+
 double *a;
 double *b;
 double *c;
@@ -49,8 +51,13 @@ int verify_matrix(double *matrix, const double *checksum,
 int main() {
   const unsigned int num_cores = snrt_cluster_core_num();
   const unsigned int cid = snrt_cluster_core_idx();
+
+  #ifdef USE_CACHE
+  uint32_t spm_size = 32;
+  #else
   uint32_t spm_size = 120;
-  
+  #endif
+
   if (cid == 0) {
     // Init the cache
     l1d_init(spm_size);
@@ -68,9 +75,15 @@ int main() {
 
   // Allocate the matrices in the local tile
   if (cid == 0) {
+  #ifdef USE_CACHE
+    a = gemm_A_dram;
+    b = gemm_B_dram;
+    c = gemm_C_dram;
+  #else
     a = (double *)snrt_l1alloc(gemm_l.M * gemm_l.K * sizeof(double));
     b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double));
     c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double));
+  #endif
   }
 
   // Reset timer
@@ -89,12 +102,14 @@ int main() {
   snrt_cluster_hw_barrier();
 
   // Initialize matrices
+  #ifndef USE_CACHE
   if (cid == 0) {
     snrt_dma_start_1d(a, gemm_A_dram, gemm_l.M * gemm_l.K * sizeof(double));
     snrt_dma_start_1d(b, gemm_B_dram, gemm_l.K * gemm_l.N * sizeof(double));
     snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double));
     snrt_dma_wait_all();
   }
+  #endif
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
diff --git a/sw/spatzBenchmarks/dp-mxfmatmul-m4n4k4-b2/main.c b/sw/spatzBenchmarks/dp-mxfmatmul-m4n4k4-b2/main.c
index 0f72832..06ef570 100644
--- a/sw/spatzBenchmarks/dp-mxfmatmul-m4n4k4-b2/main.c
+++ b/sw/spatzBenchmarks/dp-mxfmatmul-m4n4k4-b2/main.c
@@ -124,7 +124,7 @@ int main() {
 #ifndef USE_CACHE
   uint32_t spm_size = 120; // 120 KB out of 128 KB
 #else
-  uint32_t spm_size = 32; // Reserve small portion for SPM
+  uint32_t spm_size = 16; // Reserve small portion for stack only
 #endif
 
   if (cid == 0) {
@@ -142,7 +142,7 @@ int main() {
     b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double));
     c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double));
   }
-#else 
+#else
   a = gemm_A_dram;
   b = gemm_B_dram;
   c = gemm_C_dram;
@@ -176,7 +176,7 @@ int main() {
     snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double));
     snrt_dma_wait_all();
   }
-#endif 
+#endif
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
@@ -251,7 +251,7 @@ int main() {
       for (unsigned int j = 0; j < gemm_l.N; j++) {
         checksum += c[i * gemm_l.N + j];
       }
-      printf("Checksum[%d]=%f\n", i, checksum);
+      // printf("Checksum[%d]=%f\n", i, checksum);
       double diff = checksum - (double)gemm_checksum[i];
       if (diff < 0)
         diff = -diff;
diff --git a/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b2/main.c b/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b2/main.c
index cd972d4..23b0b44 100644
--- a/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b2/main.c
+++ b/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b2/main.c
@@ -126,7 +126,7 @@ int main() {
 #else
   uint32_t spm_size = 32; // Reserve small portion for SPM
 #endif
-  
+
   if (cid == 0) {
     // Init the cache
     l1d_init(spm_size);
@@ -142,7 +142,7 @@ int main() {
     b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double));
     c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double));
   }
-#else 
+#else
   a = gemm_A_dram;
   b = gemm_B_dram;
   c = gemm_C_dram;
@@ -176,7 +176,7 @@ int main() {
     snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double));
     snrt_dma_wait_all();
   }
-#endif 
+#endif
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
@@ -251,7 +251,7 @@ int main() {
       for (unsigned int j = 0; j < gemm_l.N; j++) {
         checksum += c[i * gemm_l.N + j];
       }
-      printf("Checksum[%d]=%f\n", i, checksum);
+      // printf("Checksum[%d]=%f\n", i, checksum);
       double diff = checksum - (double)gemm_checksum[i];
       if (diff < 0)
         diff = -diff;
diff --git a/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b4/main.c b/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b4/main.c
index 9c1adf6..66640ba 100644
--- a/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b4/main.c
+++ b/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b4/main.c
@@ -124,9 +124,9 @@ int main() {
 #ifndef USE_CACHE
   uint32_t spm_size = 120; // 120 KB out of 128 KB
 #else
-  uint32_t spm_size = 32; // Reserve small portion for SPM
+  uint32_t spm_size = 16; // Reserve small portion for SPM
 #endif
-  
+
   if (cid == 0) {
     // Init the cache
     l1d_init(spm_size);
@@ -142,7 +142,7 @@ int main() {
     b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double));
     c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double));
   }
-#else 
+#else
   a = gemm_A_dram;
   b = gemm_B_dram;
   c = gemm_C_dram;
@@ -176,7 +176,7 @@ int main() {
     snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double));
     snrt_dma_wait_all();
   }
-#endif 
+#endif
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
@@ -251,7 +251,7 @@ int main() {
       for (unsigned int j = 0; j < gemm_l.N; j++) {
         checksum += c[i * gemm_l.N + j];
       }
-      printf("Checksum[%d]=%f\n", i, checksum);
+      // printf("Checksum[%d]=%f\n", i, checksum);
       double diff = checksum - (double)gemm_checksum[i];
       if (diff < 0)
         diff = -diff;