From b689b01073588d20245f251b1752f16723ee8eea Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Tue, 5 Dec 2023 18:03:43 +0100 Subject: [PATCH 01/16] Move specific snax-library out of common makefile rules --- runtime/Makefile.rules | 4 ---- runtime/snax-mac.rules | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/runtime/Makefile.rules b/runtime/Makefile.rules index 6b5490de..18a20734 100644 --- a/runtime/Makefile.rules +++ b/runtime/Makefile.rules @@ -15,7 +15,6 @@ MLIRTRANSLATE = mlir-translate-16 SNAXOPT = $(MAKEFILE_RULES_DIRNAME)/../compiler/snax-opt PYTHON = /opt/python3.11/bin/python3 -CFLAGS = # Mixing .c and .ll files makes some flags, useful for the former, # unused for the latter (e.g. -I) CFLAGS += -Wno-unused-command-line-argument @@ -32,7 +31,6 @@ CFLAGS += -I$(SNITCH_SW_PATH)/sw/math/src/include CFLAGS += -I$(SNITCH_SW_PATH)/sw/math/src/internal CFLAGS += -I$(SNITCH_SW_PATH)/sw/math/include/bits CFLAGS += -I$(SNITCH_SW_PATH)/sw/math/include -CFLAGS += -I$(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/mac/include CFLAGS += -I$(MAKEFILE_RULES_DIRNAME)include CFLAGS += -D__DEFINED_uint64_t CFLAGS += -menable-experimental-extensions @@ -45,7 +43,6 @@ CFLAGS += -fno-builtin-printf CFLAGS += -fno-common CFLAGS += -O3 -LDFLAGS = LDFLAGS += -fuse-ld=$(SNITCH_LLVM_PATH)/bin/ld.lld LDFLAGS += -L$(SNITCH_LLVM_PATH)/lib/clang/12.0.1/lib/ LDFLAGS += -T$(SNITCH_SW_PATH)/sw/snRuntime/base.ld @@ -56,7 +53,6 @@ LDFLAGS += -nostdlib LDFLAGS += -lclang_rt.builtins-riscv32 LDFLAGS += -lc LDFLAGS += -lsnRuntime -LDFLAGS += $(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/mac/build/mac.o # useful for debugging at llvm level: %.ll: %.c diff --git a/runtime/snax-mac.rules b/runtime/snax-mac.rules index c1da31cf..477d09d1 100644 --- a/runtime/snax-mac.rules +++ b/runtime/snax-mac.rules @@ -1,3 +1,5 @@ # Specific settings for snax-mac RTL SNITCH_SW_PATH = /opt/snax-mac VLTSIM = /opt/snax-mac-rtl/bin/snitch_cluster.vlt +CFLAGS += -I$(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/mac/include +LDFLAGS += $(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/mac/build/mac.o From 27735a4dfc7e3cb85cafeca842c1661f6ffb3efa Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Tue, 5 Dec 2023 18:39:05 +0100 Subject: [PATCH 02/16] Add 2D variants of i8 and i32 memrefs --- runtime/include/memref.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/runtime/include/memref.h b/runtime/include/memref.h index f4e9e2ce..c7742617 100644 --- a/runtime/include/memref.h +++ b/runtime/include/memref.h @@ -12,4 +12,26 @@ struct OneDMemrefI32 { uint32_t stride[1]; }; +struct TwoDMemrefI32 { + int32_t *data; // allocated pointer: Pointer to data buffer as allocated, + // only used for deallocating the memref + int32_t *aligned_data; // aligned pointer: Pointer to properly aligned data + // that memref indexes + uint32_t offset; + uint32_t shape[2]; + uint32_t stride[2]; +}; + +struct TwoDMemrefI8 { + int8_t *data; // allocated pointer: Pointer to data buffer as allocated, + // only used for deallocating the memref + int8_t *aligned_data; // aligned pointer: Pointer to properly aligned data + // that memref indexes + uint32_t offset; + uint32_t shape[2]; + uint32_t stride[2]; +}; + typedef struct OneDMemrefI32 OneDMemrefI32_t; +typedef struct TwoDMemrefI8 TwoDMemrefI8_t; +typedef struct TwoDMemrefI32 TwoDMemrefI32_t; From 09876df7276ae945a0d2a2804147ce0b6dff71d8 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Tue, 5 Dec 2023 18:40:04 +0100 Subject: [PATCH 03/16] Add snax-gemm makefile 
rules --- runtime/snax-gemm.rules | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 runtime/snax-gemm.rules diff --git a/runtime/snax-gemm.rules b/runtime/snax-gemm.rules new file mode 100644 index 00000000..57211d49 --- /dev/null +++ b/runtime/snax-gemm.rules @@ -0,0 +1,5 @@ +# Specific settings for snax-mac RTL +SNITCH_SW_PATH = /opt/snax-gemm +VLTSIM = /opt/snax-gemm-rtl/bin/snitch_cluster.vlt +CFLAGS += -I$(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/gemm/include +LDFLAGS += $(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/gemm/build/snax-gemm-lib.o From 9ca258c6f1cdf98a1bf9baed5d6e1a48c3b726a4 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Tue, 5 Dec 2023 18:41:30 +0100 Subject: [PATCH 04/16] Add WIP version of baseline --- kernels/simple_matmul/Makefile | 34 ++++++++++++++++ kernels/simple_matmul/baseline.c | 31 ++++++++++++++ kernels/simple_matmul/data.c | 10 +++++ kernels/simple_matmul/data.h | 9 ++++ kernels/simple_matmul/main.c | 70 ++++++++++++++++++++++++++++++++ 5 files changed, 154 insertions(+) create mode 100644 kernels/simple_matmul/Makefile create mode 100644 kernels/simple_matmul/baseline.c create mode 100644 kernels/simple_matmul/data.c create mode 100644 kernels/simple_matmul/data.h create mode 100644 kernels/simple_matmul/main.c diff --git a/kernels/simple_matmul/Makefile b/kernels/simple_matmul/Makefile new file mode 100644 index 00000000..c0bb66ea --- /dev/null +++ b/kernels/simple_matmul/Makefile @@ -0,0 +1,34 @@ +# Courtesy of Federico Ficarelli + +.DEFAULT_GOAL := all + +include ../../runtime/snax-gemm.rules +include ../../runtime/Makefile.rules + +TESTS = +TESTS += baseline.x +TESTS += linalg.x + +CFLAGS += -std=gnu11 +CFLAGS += -Wall -Wextra + +data.c data.h: + $(PYTHON) gendata.py + +%.x: %.o main.o data.o + $(LD) $(LDFLAGS) $^ -o $@ + +sim_%: % + rm -fr ./logs/ + $(VLTSIM) $< + +RUN = $(addprefix run_, $(TESTS)) +$(RUN): run_%: sim_% + mv logs $(subst sim_,,$<).logs + +all: $(TESTS) + +allrun: $(RUN) + +clean: + rm -fr *.ll12 *.x *.o *.logs/ logs/ data.h data.c diff --git a/kernels/simple_matmul/baseline.c b/kernels/simple_matmul/baseline.c new file mode 100644 index 00000000..af919129 --- /dev/null +++ b/kernels/simple_matmul/baseline.c @@ -0,0 +1,31 @@ +// #include "data.h" +#include "memref.h" +#include "snax-gemm-lib.h" + +#include + +#include + +void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *A, TwoDMemrefI8_t *B, + TwoDMemrefI32_t *C) { + uint8_t Batch = 1; + uint8_t M = (uint8_t)A->shape[0]; + uint8_t K = (uint8_t)A->shape[1]; + uint8_t N = (uint8_t)B->shape[1]; + int8_t *start_addr_a = A->aligned_data; + int8_t *start_addr_b = B->aligned_data; + int32_t *start_addr_c = C->aligned_data; + // TODO extract parameters below from memref? 
+ uint32_t strideInnermostA = 256; + uint32_t strideInnermostB = 256; + uint32_t strideInnermostC = 256; + uint32_t ldA = 2048; + uint32_t ldB = 2048; + uint32_t ldC = 1024; + uint32_t strideA = 0; + uint32_t strideB = 0; + uint32_t strideC = 0; + batch_gemm_cpu(Batch, M, K, N, start_addr_a, start_addr_b, start_addr_c, + strideInnermostA, strideInnermostB, strideInnermostC, ldA, ldB, + ldC, strideA, strideB, strideC); +} diff --git a/kernels/simple_matmul/data.c b/kernels/simple_matmul/data.c new file mode 100644 index 00000000..2d104892 --- /dev/null +++ b/kernels/simple_matmul/data.c @@ -0,0 +1,10 @@ +#include "data.h" + +const int8_t A[N_size * K_size] = {44, -81, -11, 64, -61, + 123, 67, -25, -119, 83}; + +const int8_t B[K_size * M_size] = {-107, 114, -92, -41, -58, + 88, -40, 12, -70, 65}; + +const int32_t G[N_size * M_size] = {-4708, -9234, 1012, -2624, 3538, + 10824, -2680, -300, 8330, 5395}; diff --git a/kernels/simple_matmul/data.h b/kernels/simple_matmul/data.h new file mode 100644 index 00000000..4154c644 --- /dev/null +++ b/kernels/simple_matmul/data.h @@ -0,0 +1,9 @@ +#include +#pragma once + +#define N_size 16 +#define K_size 16 +#define M_size 16 +extern const int8_t A[N_size * K_size]; +extern const int8_t B[K_size * M_size]; +extern const int32_t G[N_size * M_size]; diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c new file mode 100644 index 00000000..afad824d --- /dev/null +++ b/kernels/simple_matmul/main.c @@ -0,0 +1,70 @@ +#include "data.h" +#include "memref.h" +#include "snax-gemm-lib.h" +#include "snax_rt.h" +#include "stdint.h" + +#include +#include + +// Kernel provided via external definition +void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, + TwoDMemrefI32_t *c); + +// void _mlir_ciface_snax_hwpe_mult(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, +// TwoDMemrefI32_t *c) { +// +// set_batch_gemm(a->aligned_data, b->aligned_data, c->aligned_data, +// a->shape[0]); +// start_batch_gemm(); +// wait_batch_gemm(); +// } + +int main() { + + // Create memref objects for data stored in L1 + TwoDMemrefI8_t memrefA; + memrefA.data = &A; + memrefA.aligned_data = memrefA.data; + memrefA.offset = 0; + memrefA.shape[0] = M_size; + memrefA.shape[1] = K_size; + memrefA.stride[0] = sizeof(int8_t); + memrefA.stride[1] = sizeof(int8_t); + + TwoDMemrefI8_t memrefB; + memrefB.data = &B; + memrefB.aligned_data = memrefB.data; + memrefA.offset = 0; + memrefA.shape[0] = K_size; + memrefA.shape[1] = N_size; + memrefA.stride[0] = sizeof(int8_t); + memrefA.stride[1] = sizeof(int8_t); + + TwoDMemrefI32_t memrefC; + memrefC.data = &G; + memrefC.aligned_data = memrefC.data; + memrefC.offset = 0; + memrefC.shape[0] = M_size; + memrefC.shape[1] = N_size; + memrefC.stride[0] = sizeof(int32_t); + memrefC.stride[1] = sizeof(int32_t); + + (void)snrt_mcycle(); + _mlir_ciface_simple_matmul(&memrefA, &memrefB, &memrefC); + (void)snrt_mcycle(); + + // Correctness check - + // from this point on only core 0 is required to be alive. 
+ int thiscore = snrt_cluster_core_idx(); + if (thiscore != 0) + return 0; + + int nerr = 0; + for (int i = 0; i < M_size * N_size; i++) { + int32_t error = memrefC.aligned_data[i] - G[i]; + if (error != 0) + nerr += 1; + } + return nerr; +} From 3c5a06a8433a93e6637c7cf2ae023787f76d6167 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Tue, 12 Dec 2023 11:27:14 +0100 Subject: [PATCH 05/16] Add automatic data generation --- kernels/simple_matmul/Makefile | 1 - kernels/simple_matmul/baseline.c | 12 +++---- kernels/simple_matmul/data.c | 10 ------ kernels/simple_matmul/data.h | 9 ----- kernels/simple_matmul/gendata.py | 58 ++++++++++++++++++++++++++++++++ kernels/simple_matmul/main.c | 4 +-- 6 files changed, 66 insertions(+), 28 deletions(-) delete mode 100644 kernels/simple_matmul/data.c delete mode 100644 kernels/simple_matmul/data.h create mode 100755 kernels/simple_matmul/gendata.py diff --git a/kernels/simple_matmul/Makefile b/kernels/simple_matmul/Makefile index c0bb66ea..5f3adbf1 100644 --- a/kernels/simple_matmul/Makefile +++ b/kernels/simple_matmul/Makefile @@ -7,7 +7,6 @@ include ../../runtime/Makefile.rules TESTS = TESTS += baseline.x -TESTS += linalg.x CFLAGS += -std=gnu11 CFLAGS += -Wall -Wextra diff --git a/kernels/simple_matmul/baseline.c b/kernels/simple_matmul/baseline.c index af919129..fb2ee3a2 100644 --- a/kernels/simple_matmul/baseline.c +++ b/kernels/simple_matmul/baseline.c @@ -9,9 +9,9 @@ void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *A, TwoDMemrefI8_t *B, TwoDMemrefI32_t *C) { uint8_t Batch = 1; - uint8_t M = (uint8_t)A->shape[0]; - uint8_t K = (uint8_t)A->shape[1]; - uint8_t N = (uint8_t)B->shape[1]; + uint8_t M_size = (uint8_t)A->shape[0]; + uint8_t K_size = (uint8_t)A->shape[1]; + uint8_t N_size = (uint8_t)B->shape[1]; int8_t *start_addr_a = A->aligned_data; int8_t *start_addr_b = B->aligned_data; int32_t *start_addr_c = C->aligned_data; @@ -25,7 +25,7 @@ void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *A, TwoDMemrefI8_t *B, uint32_t strideA = 0; uint32_t strideB = 0; uint32_t strideC = 0; - batch_gemm_cpu(Batch, M, K, N, start_addr_a, start_addr_b, start_addr_c, - strideInnermostA, strideInnermostB, strideInnermostC, ldA, ldB, - ldC, strideA, strideB, strideC); + batch_gemm_cpu(Batch, M_size, K_size, N_size, start_addr_a, start_addr_b, + start_addr_c, strideInnermostA, strideInnermostB, + strideInnermostC, ldA, ldB, ldC, strideA, strideB, strideC); } diff --git a/kernels/simple_matmul/data.c b/kernels/simple_matmul/data.c deleted file mode 100644 index 2d104892..00000000 --- a/kernels/simple_matmul/data.c +++ /dev/null @@ -1,10 +0,0 @@ -#include "data.h" - -const int8_t A[N_size * K_size] = {44, -81, -11, 64, -61, - 123, 67, -25, -119, 83}; - -const int8_t B[K_size * M_size] = {-107, 114, -92, -41, -58, - 88, -40, 12, -70, 65}; - -const int32_t G[N_size * M_size] = {-4708, -9234, 1012, -2624, 3538, - 10824, -2680, -300, 8330, 5395}; diff --git a/kernels/simple_matmul/data.h b/kernels/simple_matmul/data.h deleted file mode 100644 index 4154c644..00000000 --- a/kernels/simple_matmul/data.h +++ /dev/null @@ -1,9 +0,0 @@ -#include -#pragma once - -#define N_size 16 -#define K_size 16 -#define M_size 16 -extern const int8_t A[N_size * K_size]; -extern const int8_t B[K_size * M_size]; -extern const int32_t G[N_size * M_size]; diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py new file mode 100755 index 00000000..f5304f5c --- /dev/null +++ b/kernels/simple_matmul/gendata.py @@ -0,0 +1,58 @@ +# simple script to generate inputs and expected 
outputs for simple_matmult +import numpy as np +from numpy import typing as npt +from typing import Dict + + +def create_header( + file_name: str, sizes: Dict[str, int], variables: Dict[str, npt.NDArray] +) -> None: + with open(file_name, "w") as f: + includes = ["#include ", "#pragma once", ""] + includes = "\n".join(includes) + variables_string = [""] + for i, j in sizes.items(): + variables_string.append(f"#define {i} {j}") + variables_string.append("") + for i, j in variables.items(): + variables_string.append(f"extern const {j.dtype}_t {i}[{j.size}];") + variables_string = "\n".join(variables_string) + f.write(includes) + f.write(variables_string) + f.write("\n") + + +def create_data(file_name: str, variables: Dict[str, npt.NDArray]): + includes = ['#include "data.h"', "", ""] + includes = "\n".join(includes) + variables = {i: np.reshape(j, j.size) for i, j in variables.items()} + with open(file_name, "w") as f: + f.write(includes) + for variable_name, variable_value in variables.items(): + f.write( + f"const {variable_value.dtype}_t {variable_name}" + + f"[{variable_value.size}] = " + + "{\n" + ) + variable_str = ["\t" + str(i) for i in variable_value] + f.write(",\n".join(variable_str)) + f.write("\n};\n\n") + + +if __name__ == "__main__": + # Reset random seed for reproducible behavior + low_bound = -128 + high_bound = 127 + A_size = [16, 16] + B_size = [16, 32] + np.random.seed(0) + # G = A*B + A = np.random.randint(low_bound, high_bound, size=A_size, dtype=np.dtype("int8")) + B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) + C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) + C = np.zeros(C_golden.shape, np.dtype("int32")) + variables = {"A": A, "B": B, "C_golden": C_golden, "C": C} + assert A.shape[1] == B.shape[0] + sizes = {"N_size": A.shape[0], "K_size": A.shape[1], "M_size": B.shape[1]} + create_header("data.h", sizes, variables) + create_data("data.c", variables) diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index afad824d..e6e8b225 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -42,7 +42,7 @@ int main() { memrefA.stride[1] = sizeof(int8_t); TwoDMemrefI32_t memrefC; - memrefC.data = &G; + memrefC.data = &C; memrefC.aligned_data = memrefC.data; memrefC.offset = 0; memrefC.shape[0] = M_size; @@ -62,7 +62,7 @@ int main() { int nerr = 0; for (int i = 0; i < M_size * N_size; i++) { - int32_t error = memrefC.aligned_data[i] - G[i]; + int32_t error = memrefC.aligned_data[i] - C_golden[i]; if (error != 0) nerr += 1; } From c6c5142e2bae3ba74c38f2d9d625f61b5ba65aa4 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Wed, 13 Dec 2023 09:45:31 +0100 Subject: [PATCH 06/16] Add caller from mlir --- kernels/simple_matmul/Makefile | 2 +- kernels/simple_matmul/baseline.c | 31 ------------------------------- kernels/simple_matmul/cpu.mlir | 7 +++++++ 3 files changed, 8 insertions(+), 32 deletions(-) delete mode 100644 kernels/simple_matmul/baseline.c create mode 100644 kernels/simple_matmul/cpu.mlir diff --git a/kernels/simple_matmul/Makefile b/kernels/simple_matmul/Makefile index 5f3adbf1..ac165f01 100644 --- a/kernels/simple_matmul/Makefile +++ b/kernels/simple_matmul/Makefile @@ -6,7 +6,7 @@ include ../../runtime/snax-gemm.rules include ../../runtime/Makefile.rules TESTS = -TESTS += baseline.x +TESTS += cpu.x CFLAGS += -std=gnu11 CFLAGS += -Wall -Wextra diff --git a/kernels/simple_matmul/baseline.c b/kernels/simple_matmul/baseline.c deleted file mode 100644 index 
fb2ee3a2..00000000 --- a/kernels/simple_matmul/baseline.c +++ /dev/null @@ -1,31 +0,0 @@ -// #include "data.h" -#include "memref.h" -#include "snax-gemm-lib.h" - -#include - -#include - -void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *A, TwoDMemrefI8_t *B, - TwoDMemrefI32_t *C) { - uint8_t Batch = 1; - uint8_t M_size = (uint8_t)A->shape[0]; - uint8_t K_size = (uint8_t)A->shape[1]; - uint8_t N_size = (uint8_t)B->shape[1]; - int8_t *start_addr_a = A->aligned_data; - int8_t *start_addr_b = B->aligned_data; - int32_t *start_addr_c = C->aligned_data; - // TODO extract parameters below from memref? - uint32_t strideInnermostA = 256; - uint32_t strideInnermostB = 256; - uint32_t strideInnermostC = 256; - uint32_t ldA = 2048; - uint32_t ldB = 2048; - uint32_t ldC = 1024; - uint32_t strideA = 0; - uint32_t strideB = 0; - uint32_t strideC = 0; - batch_gemm_cpu(Batch, M_size, K_size, N_size, start_addr_a, start_addr_b, - start_addr_c, strideInnermostA, strideInnermostB, - strideInnermostC, ldA, ldB, ldC, strideA, strideB, strideC); -} diff --git a/kernels/simple_matmul/cpu.mlir b/kernels/simple_matmul/cpu.mlir new file mode 100644 index 00000000..8c6b59a6 --- /dev/null +++ b/kernels/simple_matmul/cpu.mlir @@ -0,0 +1,7 @@ +func.func public @simple_matmul(%A: memref<16x16xi8, 1 : i32>, + %B: memref<16x32xi8, 1 : i32>, + %C: memref<16x32xi32, 1 : i32>) -> () { + func.call @simple_matmul_cpu(%A, %B, %C) : (memref<16x16xi8, 1 : i32>, memref<16x32xi8, 1 : i32>, memref<16x32xi32, 1 : i32>) -> () + return +} +func.func private @simple_matmul_cpu(%A : memref<16x16xi8, 1 : i32>, %B : memref<16x32xi8, 1 : i32>, %C : memref<16x32xi32, 1 : i32>) From 360c183259064caf081932c952b8c3dca560dcb8 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Wed, 13 Dec 2023 11:08:03 +0100 Subject: [PATCH 07/16] Add function to setup CPU kernel --- kernels/simple_matmul/main.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index e6e8b225..b4eb7f31 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -1,6 +1,7 @@ #include "data.h" #include "memref.h" #include "snax-gemm-lib.h" +#include "snax-gemm-params.h" #include "snax_rt.h" #include "stdint.h" @@ -11,14 +12,32 @@ void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, TwoDMemrefI32_t *c); -// void _mlir_ciface_snax_hwpe_mult(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, -// TwoDMemrefI32_t *c) { -// -// set_batch_gemm(a->aligned_data, b->aligned_data, c->aligned_data, -// a->shape[0]); -// start_batch_gemm(); -// wait_batch_gemm(); -// } +void _mlir_ciface_simple_matmul_cpu(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, + TwoDMemrefI32_t *c) { + uint8_t Batch = 1; + // meshRow, tileSize and meshCol are defined in snax-gemm-params.h + uint8_t M_param = M_size / meshRow; + uint8_t K_param = K_size / tileSize; + uint8_t N_param = N_size / meshCol; + int8_t *A_ptr = a->aligned_data; + int8_t *B_ptr = b->aligned_data; + int32_t *C_ptr = c->aligned_data; + // Extracted from datagen.py in snitch_cluster repo + uint32_t strideInnermostA = 256; + uint32_t strideInnermostB = 256; + uint32_t strideInnermostC = 256; + uint32_t ldA = 2048; + uint32_t ldB = 2048; + uint32_t ldC = 1024; + uint32_t strideA = 0; + uint32_t strideB = 0; + uint32_t strideC = 0; + // delta_local_a: 64, + // delta_local_b: 8192 + batch_gemm_cpu(Batch, M_param, K_param, N_param, A_ptr, B_ptr, C_ptr, + strideInnermostA, strideInnermostB, strideInnermostC, ldA, ldB, + ldC, 
strideA, strideB, strideC); +} int main() { From ae7481c3e58638eb84b264cd609aeba6104319f3 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 10:12:25 +0100 Subject: [PATCH 08/16] Add a first version of allocation --- kernels/simple_matmul/main.c | 90 +++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 26 deletions(-) diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index b4eb7f31..58bfd86b 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -8,66 +8,104 @@ #include #include +uint8_t Batch = 1; +// meshRow, tileSize and meshCol are defined in snax-gemm-params.h +uint8_t M_param = M_size / meshRow; +uint8_t K_param = K_size / tileSize; +uint8_t N_param = N_size / meshCol; +// Extracted from datagen.py in snitch_cluster repo +uint32_t strideInnermostA = 256; +uint32_t strideInnermostB = 256; +uint32_t strideInnermostC = 256; +uint32_t ldA = 2048; +uint32_t ldB = 2048; +uint32_t ldC = 1024; +uint32_t strideA = 0; +uint32_t strideB = 0; +uint32_t strideC = 0; + // Kernel provided via external definition void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, TwoDMemrefI32_t *c); void _mlir_ciface_simple_matmul_cpu(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, TwoDMemrefI32_t *c) { - uint8_t Batch = 1; - // meshRow, tileSize and meshCol are defined in snax-gemm-params.h - uint8_t M_param = M_size / meshRow; - uint8_t K_param = K_size / tileSize; - uint8_t N_param = N_size / meshCol; int8_t *A_ptr = a->aligned_data; int8_t *B_ptr = b->aligned_data; int32_t *C_ptr = c->aligned_data; - // Extracted from datagen.py in snitch_cluster repo - uint32_t strideInnermostA = 256; - uint32_t strideInnermostB = 256; - uint32_t strideInnermostC = 256; - uint32_t ldA = 2048; - uint32_t ldB = 2048; - uint32_t ldC = 1024; - uint32_t strideA = 0; - uint32_t strideB = 0; - uint32_t strideC = 0; - // delta_local_a: 64, - // delta_local_b: 8192 batch_gemm_cpu(Batch, M_param, K_param, N_param, A_ptr, B_ptr, C_ptr, strideInnermostA, strideInnermostB, strideInnermostC, ldA, ldB, ldC, strideA, strideB, strideC); } int main() { + // Allocate space in TCDM + // We put the data in different banks, but we don't interleave the data for + // now. 
+ // + // | A | x | x | x | --> A in banks 0 - 7 --> (8/32 banks used)* + // (int8 --> 8 elements/bank) + // 1 row --> 64 elements + // | x | B | x | x | --> B in banks 7 - 15 --> (8/32 banks used)* + // (8 elements/bank)*32 banks + // 1 row --> 64 elements + // | C | C | C | C | --> C in banks 0 - 31 --> (32/32 banks used)* + // (2 elements/bank)* 32 bank + // 1 row --> 64 elements + // | x | x | x | x | + // + // 32 banks --> 1 row = 32 banks * 8 bytes --> 256 adresses further + + static int8_t *allocated_a; + static int8_t *allocated_b; + static int32_t *allocated_c; + + // Transfer data from L3 to L1 + // Using DMA only + if (snrt_is_dm_core()) { + // calculation in bytes directly + allocated_a = (int8_t *)snrt_l1alloc(256 * M_size * K_size / 64); + allocated_b = (int8_t *)snrt_l1alloc(256 * K_size * N_size / 64); + allocated_c = (int32_t *)snrt_l1alloc(256 * M_size * K_size / 64); + } - // Create memref objects for data stored in L1 + // Create memref descriptors for data stored in L1 TwoDMemrefI8_t memrefA; memrefA.data = &A; memrefA.aligned_data = memrefA.data; - memrefA.offset = 0; memrefA.shape[0] = M_size; memrefA.shape[1] = K_size; + // These are not considered correctly right now + memrefA.offset = 0; memrefA.stride[0] = sizeof(int8_t); memrefA.stride[1] = sizeof(int8_t); TwoDMemrefI8_t memrefB; memrefB.data = &B; - memrefB.aligned_data = memrefB.data; - memrefA.offset = 0; - memrefA.shape[0] = K_size; - memrefA.shape[1] = N_size; - memrefA.stride[0] = sizeof(int8_t); - memrefA.stride[1] = sizeof(int8_t); + // Data is stored in banks 8 - 15, so increment by 8banks*8bytes = 64 + memrefB.aligned_data = memrefB.data + 64; + memrefB.shape[0] = K_size; + memrefB.shape[1] = N_size; + // These are not considered correctly right now + memrefB.offset = 0; + memrefB.stride[0] = sizeof(int8_t); + memrefB.stride[1] = sizeof(int8_t); TwoDMemrefI32_t memrefC; memrefC.data = &C; memrefC.aligned_data = memrefC.data; - memrefC.offset = 0; memrefC.shape[0] = M_size; memrefC.shape[1] = N_size; + // These are not considered correctly right now + memrefC.offset = 0; memrefC.stride[0] = sizeof(int32_t); memrefC.stride[1] = sizeof(int32_t); + if (snrt_is_dm_core()) { + load_input_data(Batch, M_size / meshRow, K_size / tileSize, + N_size / meshCol, memrefA.aligned_data, + memrefB.aligned_data, A, B, strideInnermostA, + strideInnermostB, ldA, ldB, strideA, strideB); + } (void)snrt_mcycle(); _mlir_ciface_simple_matmul(&memrefA, &memrefB, &memrefC); From 9c0dfb96ddb9e90cee0f063abbe1525aaed30bb4 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 11:54:11 +0100 Subject: [PATCH 09/16] Add working example for all ones --- kernels/simple_matmul/gendata.py | 4 +++- kernels/simple_matmul/main.c | 31 ++++++++++++++++++------------- kernels/simple_mult/main.c | 1 + 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py index f5304f5c..0819f677 100755 --- a/kernels/simple_matmul/gendata.py +++ b/kernels/simple_matmul/gendata.py @@ -44,11 +44,13 @@ def create_data(file_name: str, variables: Dict[str, npt.NDArray]): low_bound = -128 high_bound = 127 A_size = [16, 16] - B_size = [16, 32] + B_size = [16, 16] np.random.seed(0) # G = A*B A = np.random.randint(low_bound, high_bound, size=A_size, dtype=np.dtype("int8")) + A = np.ones(A_size, dtype=np.dtype("int8")) B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) + B = np.ones(B_size, dtype=np.dtype("int8")) C_golden = 
np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) C = np.zeros(C_golden.shape, np.dtype("int32")) variables = {"A": A, "B": B, "C_golden": C_golden, "C": C} diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index 58bfd86b..0b284897 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -17,9 +17,9 @@ uint8_t N_param = N_size / meshCol; uint32_t strideInnermostA = 256; uint32_t strideInnermostB = 256; uint32_t strideInnermostC = 256; -uint32_t ldA = 2048; -uint32_t ldB = 2048; -uint32_t ldC = 1024; +uint32_t ldA = 512; +uint32_t ldB = 512; +uint32_t ldC = 512; uint32_t strideA = 0; uint32_t strideB = 0; uint32_t strideC = 0; @@ -30,10 +30,10 @@ void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, void _mlir_ciface_simple_matmul_cpu(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, TwoDMemrefI32_t *c) { - int8_t *A_ptr = a->aligned_data; - int8_t *B_ptr = b->aligned_data; - int32_t *C_ptr = c->aligned_data; - batch_gemm_cpu(Batch, M_param, K_param, N_param, A_ptr, B_ptr, C_ptr, + int8_t *a_ptr = a->aligned_data; + int8_t *b_ptr = b->aligned_data; + int32_t *c_ptr = c->aligned_data; + batch_gemm_cpu(Batch, M_param, K_param, N_param, a_ptr, b_ptr, c_ptr, strideInnermostA, strideInnermostB, strideInnermostC, ldA, ldB, ldC, strideA, strideB, strideC); } @@ -66,12 +66,13 @@ int main() { // calculation in bytes directly allocated_a = (int8_t *)snrt_l1alloc(256 * M_size * K_size / 64); allocated_b = (int8_t *)snrt_l1alloc(256 * K_size * N_size / 64); - allocated_c = (int32_t *)snrt_l1alloc(256 * M_size * K_size / 64); + allocated_c = (int32_t *)snrt_l1alloc(256 * M_size * N_size / 64); } + snrt_cluster_hw_barrier(); // Create memref descriptors for data stored in L1 TwoDMemrefI8_t memrefA; - memrefA.data = &A; + memrefA.data = allocated_a; memrefA.aligned_data = memrefA.data; memrefA.shape[0] = M_size; memrefA.shape[1] = K_size; @@ -81,7 +82,7 @@ int main() { memrefA.stride[1] = sizeof(int8_t); TwoDMemrefI8_t memrefB; - memrefB.data = &B; + memrefB.data = allocated_b; // Data is stored in banks 8 - 15, so increment by 8banks*8bytes = 64 memrefB.aligned_data = memrefB.data + 64; memrefB.shape[0] = K_size; @@ -92,7 +93,7 @@ int main() { memrefB.stride[1] = sizeof(int8_t); TwoDMemrefI32_t memrefC; - memrefC.data = &C; + memrefC.data = allocated_c; memrefC.aligned_data = memrefC.data; memrefC.shape[0] = M_size; memrefC.shape[1] = N_size; @@ -106,10 +107,13 @@ int main() { memrefB.aligned_data, A, B, strideInnermostA, strideInnermostB, ldA, ldB, strideA, strideB); } - + snrt_cluster_hw_barrier(); (void)snrt_mcycle(); - _mlir_ciface_simple_matmul(&memrefA, &memrefB, &memrefC); + if (snrt_is_compute_core()) { + _mlir_ciface_simple_matmul(&memrefA, &memrefB, &memrefC); + } (void)snrt_mcycle(); + snrt_cluster_hw_barrier(); // Correctness check - // from this point on only core 0 is required to be alive. 
@@ -119,6 +123,7 @@ int main() { int nerr = 0; for (int i = 0; i < M_size * N_size; i++) { + // printf("%d , golden : %d\n", memrefC.aligned_data[i],C_golden[i]); int32_t error = memrefC.aligned_data[i] - C_golden[i]; if (error != 0) nerr += 1; diff --git a/kernels/simple_mult/main.c b/kernels/simple_mult/main.c index ffbd75de..6fdc7181 100644 --- a/kernels/simple_mult/main.c +++ b/kernels/simple_mult/main.c @@ -55,6 +55,7 @@ int main() { int nerr = 0; for (int i = 0; i < N; i++) { + printf("result: %d golden: %d\n", memrefD.aligned_data[i], G[i]); int32_t error = memrefD.aligned_data[i] - G[i]; if (error != 0) nerr += 1; From d8c31be3f80eef81e6c05bbd8fa851d5dca9985f Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 12:58:44 +0100 Subject: [PATCH 10/16] Add data layout transformation to golden model --- kernels/simple_matmul/gendata.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py index 0819f677..10e6700f 100755 --- a/kernels/simple_matmul/gendata.py +++ b/kernels/simple_matmul/gendata.py @@ -48,10 +48,18 @@ def create_data(file_name: str, variables: Dict[str, npt.NDArray]): np.random.seed(0) # G = A*B A = np.random.randint(low_bound, high_bound, size=A_size, dtype=np.dtype("int8")) - A = np.ones(A_size, dtype=np.dtype("int8")) + # convert from row-major to block row-major + A = np.reshape(A, [2, 8, 2, 8]) + # convert to [2,2,8,8] + A = np.swapaxes(A, 1, 2) B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) - B = np.ones(B_size, dtype=np.dtype("int8")) + # convert from column-major to block column-major + B = np.reshape(B, [2, 8, 2, 8]) + # convert to [2,2,8,8] + B = np.swapaxes(B, 1, 2) C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) + C_golden = np.reshape(C_golden, [2, 8, 2, 8]) + C_golden = np.swapaxes(C_golden, 1, 2) C = np.zeros(C_golden.shape, np.dtype("int32")) variables = {"A": A, "B": B, "C_golden": C_golden, "C": C} assert A.shape[1] == B.shape[0] From 98bfa2bec2df7c8ccd1cffc8b95cf77af8002862 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 13:00:22 +0100 Subject: [PATCH 11/16] Add workflow for simple_matmul --- .github/workflows/build-run-kernel.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build-run-kernel.yml b/.github/workflows/build-run-kernel.yml index bee37fdd..c74be6b3 100644 --- a/.github/workflows/build-run-kernel.yml +++ b/.github/workflows/build-run-kernel.yml @@ -26,3 +26,8 @@ jobs: export PATH=/opt/python3.11/bin:$PATH make allrun working-directory: kernels/simple_copy + - name: Build and run kernel simple_matmul + run: | + export PATH=/opt/python3.11/bin:$PATH + make allrun + working-directory: kernels/simple_matmul From a28c6ddbd25fcb05bca4b082853b3ffeb1ab2bad Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 13:17:44 +0100 Subject: [PATCH 12/16] Fix data layout transformation in golden model output --- kernels/simple_matmul/gendata.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py index 10e6700f..d7a9551d 100755 --- a/kernels/simple_matmul/gendata.py +++ b/kernels/simple_matmul/gendata.py @@ -46,23 +46,34 @@ def create_data(file_name: str, variables: Dict[str, npt.NDArray]): A_size = [16, 16] B_size = [16, 16] np.random.seed(0) - # G = A*B + + # C = A.B A = np.random.randint(low_bound, high_bound, 
size=A_size, dtype=np.dtype("int8")) + # A = np.ones(A_size, dtype=np.dtype("int8")) + B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) + # B = np.ones(B_size, dtype=np.dtype("int8")) + C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) + C = np.zeros(C_golden.shape, np.dtype("int32")) + + assert A.shape[1] == B.shape[0] + sizes = {"N_size": A.shape[0], "K_size": A.shape[1], "M_size": B.shape[1]} + + # Perform layout transformations before writing to memory # convert from row-major to block row-major A = np.reshape(A, [2, 8, 2, 8]) # convert to [2,2,8,8] A = np.swapaxes(A, 1, 2) - B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) + B = np.transpose(B) # convert from column-major to block column-major B = np.reshape(B, [2, 8, 2, 8]) # convert to [2,2,8,8] B = np.swapaxes(B, 1, 2) - C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) + # convert from row-major to block row-major C_golden = np.reshape(C_golden, [2, 8, 2, 8]) + # convert to [2,2,8,8] C_golden = np.swapaxes(C_golden, 1, 2) - C = np.zeros(C_golden.shape, np.dtype("int32")) + variables = {"A": A, "B": B, "C_golden": C_golden, "C": C} - assert A.shape[1] == B.shape[0] - sizes = {"N_size": A.shape[0], "K_size": A.shape[1], "M_size": B.shape[1]} + create_header("data.h", sizes, variables) create_data("data.c", variables) From 658f204b6161f35f26b2d66039a381dd31dcaf65 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 13:21:37 +0100 Subject: [PATCH 13/16] Refactor gendata.py --- kernels/simple_matmul/gendata.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py index d7a9551d..41ebbe06 100755 --- a/kernels/simple_matmul/gendata.py +++ b/kernels/simple_matmul/gendata.py @@ -59,21 +59,29 @@ def create_data(file_name: str, variables: Dict[str, npt.NDArray]): sizes = {"N_size": A.shape[0], "K_size": A.shape[1], "M_size": B.shape[1]} # Perform layout transformations before writing to memory + # convert from row-major to block row-major - A = np.reshape(A, [2, 8, 2, 8]) + A_new_layout = np.reshape(A, [2, 8, 2, 8]) # convert to [2,2,8,8] - A = np.swapaxes(A, 1, 2) - B = np.transpose(B) + A_new_layout = np.swapaxes(A_new_layout, 1, 2) + + B_new_layout = np.transpose(B) # convert from column-major to block column-major - B = np.reshape(B, [2, 8, 2, 8]) + B_new_layout = np.reshape(B_new_layout, [2, 8, 2, 8]) # convert to [2,2,8,8] - B = np.swapaxes(B, 1, 2) + B_new_layout = np.swapaxes(B_new_layout, 1, 2) # convert from row-major to block row-major - C_golden = np.reshape(C_golden, [2, 8, 2, 8]) + C_golden_new_layout = np.reshape(C_golden, [2, 8, 2, 8]) # convert to [2,2,8,8] - C_golden = np.swapaxes(C_golden, 1, 2) + C_golden_new_layout = np.swapaxes(C_golden_new_layout, 1, 2) - variables = {"A": A, "B": B, "C_golden": C_golden, "C": C} + # C are just all zeros, so layout not important + variables = { + "A": A_new_layout, + "B": B_new_layout, + "C_golden": C_golden_new_layout, + "C": C, + } create_header("data.h", sizes, variables) create_data("data.c", variables) From 7b8be56d36994f46951ba1b3225712b47db307f8 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Mon, 18 Dec 2023 16:02:46 +0100 Subject: [PATCH 14/16] Adress Joren's comments --- kernels/simple_matmul/gendata.py | 5 ++-- kernels/simple_matmul/main.c | 48 ++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 15 deletions(-) 
diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py index 41ebbe06..ce2e8f1a 100755 --- a/kernels/simple_matmul/gendata.py +++ b/kernels/simple_matmul/gendata.py @@ -49,13 +49,12 @@ def create_data(file_name: str, variables: Dict[str, npt.NDArray]): # C = A.B A = np.random.randint(low_bound, high_bound, size=A_size, dtype=np.dtype("int8")) - # A = np.ones(A_size, dtype=np.dtype("int8")) B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) - # B = np.ones(B_size, dtype=np.dtype("int8")) + # Make sure the product is possible! + assert A.shape[1] == B.shape[0] C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) C = np.zeros(C_golden.shape, np.dtype("int32")) - assert A.shape[1] == B.shape[0] sizes = {"N_size": A.shape[0], "K_size": A.shape[1], "M_size": B.shape[1]} # Perform layout transformations before writing to memory diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index 0b284897..92fae803 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -1,12 +1,28 @@ +#include "stdint.h" + #include "data.h" #include "memref.h" -#include "snax-gemm-lib.h" -#include "snax-gemm-params.h" #include "snax_rt.h" -#include "stdint.h" +/* + * These libraries are included from github.com/KULeuven-MICAS/snitch_cluster + * Interested users, might want to look at: + * + * /sw/snRuntime/api + * /target/snitch_cluster/sw/runtime/rtl/src + * /target/snitch_cluster/sw/runtime/common + * */ #include -#include + +/* These libraries are included from github.com/KULeuven-MICAS/snitch_cluster + * Interested users, might want to look at: + * + * /target/snitch_cluster/sw/snax/gemm/include" + * /target/snitch_cluster/sw/snax/mac/include" + * + * */ +#include "snax-gemm-lib.h" +#include "snax-gemm-params.h" uint8_t Batch = 1; // meshRow, tileSize and meshCol are defined in snax-gemm-params.h @@ -76,10 +92,13 @@ int main() { memrefA.aligned_data = memrefA.data; memrefA.shape[0] = M_size; memrefA.shape[1] = K_size; - // These are not considered correctly right now + // The following values of this memref are ignored right now. + // A 2D memref is not enough to express a tiled-block layout (=4D), + // necessary by the accelerator, + // Instead we use the variables strideInnermostA, ldA and strideA memrefA.offset = 0; - memrefA.stride[0] = sizeof(int8_t); - memrefA.stride[1] = sizeof(int8_t); + memrefA.stride[0] = 0; + memrefA.stride[1] = 0; TwoDMemrefI8_t memrefB; memrefB.data = allocated_b; @@ -87,17 +106,23 @@ int main() { memrefB.aligned_data = memrefB.data + 64; memrefB.shape[0] = K_size; memrefB.shape[1] = N_size; - // These are not considered correctly right now + // The following values of this memref are ignored right now. + // A 2D memref is not enough to express a tiled-block layout (=4D), + // necessary by the accelerator, + // Instead we use the variables strideInnermostB, ldB and strideB. memrefB.offset = 0; - memrefB.stride[0] = sizeof(int8_t); - memrefB.stride[1] = sizeof(int8_t); + memrefB.stride[0] = 0; + memrefB.stride[1] = 0; TwoDMemrefI32_t memrefC; memrefC.data = allocated_c; memrefC.aligned_data = memrefC.data; memrefC.shape[0] = M_size; memrefC.shape[1] = N_size; - // These are not considered correctly right now + // The following values of this memref are ignored right now. 
+ // A 2D memref is not enough to express a tiled-block layout (=4D), + // necessary by the accelerator, + // Instead we use the variables strideInnermostC, ldC and strideC memrefC.offset = 0; memrefC.stride[0] = sizeof(int32_t); memrefC.stride[1] = sizeof(int32_t); @@ -123,7 +148,6 @@ int main() { int nerr = 0; for (int i = 0; i < M_size * N_size; i++) { - // printf("%d , golden : %d\n", memrefC.aligned_data[i],C_golden[i]); int32_t error = memrefC.aligned_data[i] - C_golden[i]; if (error != 0) nerr += 1; From 4a50c069f0a95e85ecba7f2c57a6430060092d28 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Mon, 18 Dec 2023 16:04:51 +0100 Subject: [PATCH 15/16] Also improve comments on C memref --- kernels/simple_matmul/main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index 92fae803..1e37e3c9 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -124,8 +124,9 @@ int main() { // necessary by the accelerator, // Instead we use the variables strideInnermostC, ldC and strideC memrefC.offset = 0; - memrefC.stride[0] = sizeof(int32_t); - memrefC.stride[1] = sizeof(int32_t); + memrefC.stride[0] = 0; + memrefC.stride[1] = 0; + if (snrt_is_dm_core()) { load_input_data(Batch, M_size / meshRow, K_size / tileSize, N_size / meshCol, memrefA.aligned_data, From 6334e8968df8bc1f1f90d2fa996a1cf7db5f394a Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Mon, 18 Dec 2023 16:13:29 +0100 Subject: [PATCH 16/16] Remove print statements from simple_mult --- kernels/simple_mult/main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernels/simple_mult/main.c b/kernels/simple_mult/main.c index 6fdc7181..ffbd75de 100644 --- a/kernels/simple_mult/main.c +++ b/kernels/simple_mult/main.c @@ -55,7 +55,6 @@ int main() { int nerr = 0; for (int i = 0; i < N; i++) { - printf("result: %d golden: %d\n", memrefD.aligned_data[i], G[i]); int32_t error = memrefD.aligned_data[i] - G[i]; if (error != 0) nerr += 1;