From b689b01073588d20245f251b1752f16723ee8eea Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Tue, 5 Dec 2023 18:03:43 +0100 Subject: [PATCH 01/16] Move specific snax-library out of common makefile rules --- runtime/Makefile.rules | 4 ---- runtime/snax-mac.rules | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/runtime/Makefile.rules b/runtime/Makefile.rules index 6b5490de..18a20734 100644 --- a/runtime/Makefile.rules +++ b/runtime/Makefile.rules @@ -15,7 +15,6 @@ MLIRTRANSLATE = mlir-translate-16 SNAXOPT = $(MAKEFILE_RULES_DIRNAME)/../compiler/snax-opt PYTHON = /opt/python3.11/bin/python3 -CFLAGS = # Mixing .c and .ll files makes some flags, useful for the former, # unused for the latter (e.g. -I) CFLAGS += -Wno-unused-command-line-argument @@ -32,7 +31,6 @@ CFLAGS += -I$(SNITCH_SW_PATH)/sw/math/src/include CFLAGS += -I$(SNITCH_SW_PATH)/sw/math/src/internal CFLAGS += -I$(SNITCH_SW_PATH)/sw/math/include/bits CFLAGS += -I$(SNITCH_SW_PATH)/sw/math/include -CFLAGS += -I$(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/mac/include CFLAGS += -I$(MAKEFILE_RULES_DIRNAME)include CFLAGS += -D__DEFINED_uint64_t CFLAGS += -menable-experimental-extensions @@ -45,7 +43,6 @@ CFLAGS += -fno-builtin-printf CFLAGS += -fno-common CFLAGS += -O3 -LDFLAGS = LDFLAGS += -fuse-ld=$(SNITCH_LLVM_PATH)/bin/ld.lld LDFLAGS += -L$(SNITCH_LLVM_PATH)/lib/clang/12.0.1/lib/ LDFLAGS += -T$(SNITCH_SW_PATH)/sw/snRuntime/base.ld @@ -56,7 +53,6 @@ LDFLAGS += -nostdlib LDFLAGS += -lclang_rt.builtins-riscv32 LDFLAGS += -lc LDFLAGS += -lsnRuntime -LDFLAGS += $(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/mac/build/mac.o # useful for debugging at llvm level: %.ll: %.c diff --git a/runtime/snax-mac.rules b/runtime/snax-mac.rules index c1da31cf..477d09d1 100644 --- a/runtime/snax-mac.rules +++ b/runtime/snax-mac.rules @@ -1,3 +1,5 @@ # Specific settings for snax-mac RTL SNITCH_SW_PATH = /opt/snax-mac VLTSIM = /opt/snax-mac-rtl/bin/snitch_cluster.vlt +CFLAGS += -I$(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/mac/include +LDFLAGS += $(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/mac/build/mac.o From 27735a4dfc7e3cb85cafeca842c1661f6ffb3efa Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Tue, 5 Dec 2023 18:39:05 +0100 Subject: [PATCH 02/16] Add 2D variants of i8 and i32 memrefs --- runtime/include/memref.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/runtime/include/memref.h b/runtime/include/memref.h index f4e9e2ce..c7742617 100644 --- a/runtime/include/memref.h +++ b/runtime/include/memref.h @@ -12,4 +12,26 @@ struct OneDMemrefI32 { uint32_t stride[1]; }; +struct TwoDMemrefI32 { + int32_t *data; // allocated pointer: Pointer to data buffer as allocated, + // only used for deallocating the memref + int32_t *aligned_data; // aligned pointer: Pointer to properly aligned data + // that memref indexes + uint32_t offset; + uint32_t shape[2]; + uint32_t stride[2]; +}; + +struct TwoDMemrefI8 { + int8_t *data; // allocated pointer: Pointer to data buffer as allocated, + // only used for deallocating the memref + int8_t *aligned_data; // aligned pointer: Pointer to properly aligned data + // that memref indexes + uint32_t offset; + uint32_t shape[2]; + uint32_t stride[2]; +}; + typedef struct OneDMemrefI32 OneDMemrefI32_t; +typedef struct TwoDMemrefI8 TwoDMemrefI8_t; +typedef struct TwoDMemrefI32 TwoDMemrefI32_t; From 09876df7276ae945a0d2a2804147ce0b6dff71d8 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Tue, 5 Dec 2023 18:40:04 +0100 Subject: [PATCH 03/16] Add snax-gemm makefile 
rules --- runtime/snax-gemm.rules | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 runtime/snax-gemm.rules diff --git a/runtime/snax-gemm.rules b/runtime/snax-gemm.rules new file mode 100644 index 00000000..57211d49 --- /dev/null +++ b/runtime/snax-gemm.rules @@ -0,0 +1,5 @@ +# Specific settings for snax-mac RTL +SNITCH_SW_PATH = /opt/snax-gemm +VLTSIM = /opt/snax-gemm-rtl/bin/snitch_cluster.vlt +CFLAGS += -I$(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/gemm/include +LDFLAGS += $(SNITCH_SW_PATH)/target/snitch_cluster/sw/snax/gemm/build/snax-gemm-lib.o From 9ca258c6f1cdf98a1bf9baed5d6e1a48c3b726a4 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Tue, 5 Dec 2023 18:41:30 +0100 Subject: [PATCH 04/16] Add WIP version of baseline --- kernels/simple_matmul/Makefile | 34 ++++++++++++++++ kernels/simple_matmul/baseline.c | 31 ++++++++++++++ kernels/simple_matmul/data.c | 10 +++++ kernels/simple_matmul/data.h | 9 ++++ kernels/simple_matmul/main.c | 70 ++++++++++++++++++++++++++++++++ 5 files changed, 154 insertions(+) create mode 100644 kernels/simple_matmul/Makefile create mode 100644 kernels/simple_matmul/baseline.c create mode 100644 kernels/simple_matmul/data.c create mode 100644 kernels/simple_matmul/data.h create mode 100644 kernels/simple_matmul/main.c diff --git a/kernels/simple_matmul/Makefile b/kernels/simple_matmul/Makefile new file mode 100644 index 00000000..c0bb66ea --- /dev/null +++ b/kernels/simple_matmul/Makefile @@ -0,0 +1,34 @@ +# Courtesy of Federico Ficarelli + +.DEFAULT_GOAL := all + +include ../../runtime/snax-gemm.rules +include ../../runtime/Makefile.rules + +TESTS = +TESTS += baseline.x +TESTS += linalg.x + +CFLAGS += -std=gnu11 +CFLAGS += -Wall -Wextra + +data.c data.h: + $(PYTHON) gendata.py + +%.x: %.o main.o data.o + $(LD) $(LDFLAGS) $^ -o $@ + +sim_%: % + rm -fr ./logs/ + $(VLTSIM) $< + +RUN = $(addprefix run_, $(TESTS)) +$(RUN): run_%: sim_% + mv logs $(subst sim_,,$<).logs + +all: $(TESTS) + +allrun: $(RUN) + +clean: + rm -fr *.ll12 *.x *.o *.logs/ logs/ data.h data.c diff --git a/kernels/simple_matmul/baseline.c b/kernels/simple_matmul/baseline.c new file mode 100644 index 00000000..af919129 --- /dev/null +++ b/kernels/simple_matmul/baseline.c @@ -0,0 +1,31 @@ +// #include "data.h" +#include "memref.h" +#include "snax-gemm-lib.h" + +#include + +#include + +void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *A, TwoDMemrefI8_t *B, + TwoDMemrefI32_t *C) { + uint8_t Batch = 1; + uint8_t M = (uint8_t)A->shape[0]; + uint8_t K = (uint8_t)A->shape[1]; + uint8_t N = (uint8_t)B->shape[1]; + int8_t *start_addr_a = A->aligned_data; + int8_t *start_addr_b = B->aligned_data; + int32_t *start_addr_c = C->aligned_data; + // TODO extract parameters below from memref? 
+ uint32_t strideInnermostA = 256; + uint32_t strideInnermostB = 256; + uint32_t strideInnermostC = 256; + uint32_t ldA = 2048; + uint32_t ldB = 2048; + uint32_t ldC = 1024; + uint32_t strideA = 0; + uint32_t strideB = 0; + uint32_t strideC = 0; + batch_gemm_cpu(Batch, M, K, N, start_addr_a, start_addr_b, start_addr_c, + strideInnermostA, strideInnermostB, strideInnermostC, ldA, ldB, + ldC, strideA, strideB, strideC); +} diff --git a/kernels/simple_matmul/data.c b/kernels/simple_matmul/data.c new file mode 100644 index 00000000..2d104892 --- /dev/null +++ b/kernels/simple_matmul/data.c @@ -0,0 +1,10 @@ +#include "data.h" + +const int8_t A[N_size * K_size] = {44, -81, -11, 64, -61, + 123, 67, -25, -119, 83}; + +const int8_t B[K_size * M_size] = {-107, 114, -92, -41, -58, + 88, -40, 12, -70, 65}; + +const int32_t G[N_size * M_size] = {-4708, -9234, 1012, -2624, 3538, + 10824, -2680, -300, 8330, 5395}; diff --git a/kernels/simple_matmul/data.h b/kernels/simple_matmul/data.h new file mode 100644 index 00000000..4154c644 --- /dev/null +++ b/kernels/simple_matmul/data.h @@ -0,0 +1,9 @@ +#include +#pragma once + +#define N_size 16 +#define K_size 16 +#define M_size 16 +extern const int8_t A[N_size * K_size]; +extern const int8_t B[K_size * M_size]; +extern const int32_t G[N_size * M_size]; diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c new file mode 100644 index 00000000..afad824d --- /dev/null +++ b/kernels/simple_matmul/main.c @@ -0,0 +1,70 @@ +#include "data.h" +#include "memref.h" +#include "snax-gemm-lib.h" +#include "snax_rt.h" +#include "stdint.h" + +#include +#include + +// Kernel provided via external definition +void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, + TwoDMemrefI32_t *c); + +// void _mlir_ciface_snax_hwpe_mult(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, +// TwoDMemrefI32_t *c) { +// +// set_batch_gemm(a->aligned_data, b->aligned_data, c->aligned_data, +// a->shape[0]); +// start_batch_gemm(); +// wait_batch_gemm(); +// } + +int main() { + + // Create memref objects for data stored in L1 + TwoDMemrefI8_t memrefA; + memrefA.data = &A; + memrefA.aligned_data = memrefA.data; + memrefA.offset = 0; + memrefA.shape[0] = M_size; + memrefA.shape[1] = K_size; + memrefA.stride[0] = sizeof(int8_t); + memrefA.stride[1] = sizeof(int8_t); + + TwoDMemrefI8_t memrefB; + memrefB.data = &B; + memrefB.aligned_data = memrefB.data; + memrefA.offset = 0; + memrefA.shape[0] = K_size; + memrefA.shape[1] = N_size; + memrefA.stride[0] = sizeof(int8_t); + memrefA.stride[1] = sizeof(int8_t); + + TwoDMemrefI32_t memrefC; + memrefC.data = &G; + memrefC.aligned_data = memrefC.data; + memrefC.offset = 0; + memrefC.shape[0] = M_size; + memrefC.shape[1] = N_size; + memrefC.stride[0] = sizeof(int32_t); + memrefC.stride[1] = sizeof(int32_t); + + (void)snrt_mcycle(); + _mlir_ciface_simple_matmul(&memrefA, &memrefB, &memrefC); + (void)snrt_mcycle(); + + // Correctness check - + // from this point on only core 0 is required to be alive. 
+ int thiscore = snrt_cluster_core_idx(); + if (thiscore != 0) + return 0; + + int nerr = 0; + for (int i = 0; i < M_size * N_size; i++) { + int32_t error = memrefC.aligned_data[i] - G[i]; + if (error != 0) + nerr += 1; + } + return nerr; +} From 3c5a06a8433a93e6637c7cf2ae023787f76d6167 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Tue, 12 Dec 2023 11:27:14 +0100 Subject: [PATCH 05/16] Add automatic data generation --- kernels/simple_matmul/Makefile | 1 - kernels/simple_matmul/baseline.c | 12 +++---- kernels/simple_matmul/data.c | 10 ------ kernels/simple_matmul/data.h | 9 ----- kernels/simple_matmul/gendata.py | 58 ++++++++++++++++++++++++++++++++ kernels/simple_matmul/main.c | 4 +-- 6 files changed, 66 insertions(+), 28 deletions(-) delete mode 100644 kernels/simple_matmul/data.c delete mode 100644 kernels/simple_matmul/data.h create mode 100755 kernels/simple_matmul/gendata.py diff --git a/kernels/simple_matmul/Makefile b/kernels/simple_matmul/Makefile index c0bb66ea..5f3adbf1 100644 --- a/kernels/simple_matmul/Makefile +++ b/kernels/simple_matmul/Makefile @@ -7,7 +7,6 @@ include ../../runtime/Makefile.rules TESTS = TESTS += baseline.x -TESTS += linalg.x CFLAGS += -std=gnu11 CFLAGS += -Wall -Wextra diff --git a/kernels/simple_matmul/baseline.c b/kernels/simple_matmul/baseline.c index af919129..fb2ee3a2 100644 --- a/kernels/simple_matmul/baseline.c +++ b/kernels/simple_matmul/baseline.c @@ -9,9 +9,9 @@ void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *A, TwoDMemrefI8_t *B, TwoDMemrefI32_t *C) { uint8_t Batch = 1; - uint8_t M = (uint8_t)A->shape[0]; - uint8_t K = (uint8_t)A->shape[1]; - uint8_t N = (uint8_t)B->shape[1]; + uint8_t M_size = (uint8_t)A->shape[0]; + uint8_t K_size = (uint8_t)A->shape[1]; + uint8_t N_size = (uint8_t)B->shape[1]; int8_t *start_addr_a = A->aligned_data; int8_t *start_addr_b = B->aligned_data; int32_t *start_addr_c = C->aligned_data; @@ -25,7 +25,7 @@ void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *A, TwoDMemrefI8_t *B, uint32_t strideA = 0; uint32_t strideB = 0; uint32_t strideC = 0; - batch_gemm_cpu(Batch, M, K, N, start_addr_a, start_addr_b, start_addr_c, - strideInnermostA, strideInnermostB, strideInnermostC, ldA, ldB, - ldC, strideA, strideB, strideC); + batch_gemm_cpu(Batch, M_size, K_size, N_size, start_addr_a, start_addr_b, + start_addr_c, strideInnermostA, strideInnermostB, + strideInnermostC, ldA, ldB, ldC, strideA, strideB, strideC); } diff --git a/kernels/simple_matmul/data.c b/kernels/simple_matmul/data.c deleted file mode 100644 index 2d104892..00000000 --- a/kernels/simple_matmul/data.c +++ /dev/null @@ -1,10 +0,0 @@ -#include "data.h" - -const int8_t A[N_size * K_size] = {44, -81, -11, 64, -61, - 123, 67, -25, -119, 83}; - -const int8_t B[K_size * M_size] = {-107, 114, -92, -41, -58, - 88, -40, 12, -70, 65}; - -const int32_t G[N_size * M_size] = {-4708, -9234, 1012, -2624, 3538, - 10824, -2680, -300, 8330, 5395}; diff --git a/kernels/simple_matmul/data.h b/kernels/simple_matmul/data.h deleted file mode 100644 index 4154c644..00000000 --- a/kernels/simple_matmul/data.h +++ /dev/null @@ -1,9 +0,0 @@ -#include -#pragma once - -#define N_size 16 -#define K_size 16 -#define M_size 16 -extern const int8_t A[N_size * K_size]; -extern const int8_t B[K_size * M_size]; -extern const int32_t G[N_size * M_size]; diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py new file mode 100755 index 00000000..f5304f5c --- /dev/null +++ b/kernels/simple_matmul/gendata.py @@ -0,0 +1,58 @@ +# simple script to generate inputs and expected 
outputs for simple_matmult +import numpy as np +from numpy import typing as npt +from typing import Dict + + +def create_header( + file_name: str, sizes: Dict[str, int], variables: Dict[str, npt.NDArray] +) -> None: + with open(file_name, "w") as f: + includes = ["#include ", "#pragma once", ""] + includes = "\n".join(includes) + variables_string = [""] + for i, j in sizes.items(): + variables_string.append(f"#define {i} {j}") + variables_string.append("") + for i, j in variables.items(): + variables_string.append(f"extern const {j.dtype}_t {i}[{j.size}];") + variables_string = "\n".join(variables_string) + f.write(includes) + f.write(variables_string) + f.write("\n") + + +def create_data(file_name: str, variables: Dict[str, npt.NDArray]): + includes = ['#include "data.h"', "", ""] + includes = "\n".join(includes) + variables = {i: np.reshape(j, j.size) for i, j in variables.items()} + with open(file_name, "w") as f: + f.write(includes) + for variable_name, variable_value in variables.items(): + f.write( + f"const {variable_value.dtype}_t {variable_name}" + + f"[{variable_value.size}] = " + + "{\n" + ) + variable_str = ["\t" + str(i) for i in variable_value] + f.write(",\n".join(variable_str)) + f.write("\n};\n\n") + + +if __name__ == "__main__": + # Reset random seed for reproducible behavior + low_bound = -128 + high_bound = 127 + A_size = [16, 16] + B_size = [16, 32] + np.random.seed(0) + # G = A*B + A = np.random.randint(low_bound, high_bound, size=A_size, dtype=np.dtype("int8")) + B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) + C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) + C = np.zeros(C_golden.shape, np.dtype("int32")) + variables = {"A": A, "B": B, "C_golden": C_golden, "C": C} + assert A.shape[1] == B.shape[0] + sizes = {"N_size": A.shape[0], "K_size": A.shape[1], "M_size": B.shape[1]} + create_header("data.h", sizes, variables) + create_data("data.c", variables) diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index afad824d..e6e8b225 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -42,7 +42,7 @@ int main() { memrefA.stride[1] = sizeof(int8_t); TwoDMemrefI32_t memrefC; - memrefC.data = &G; + memrefC.data = &C; memrefC.aligned_data = memrefC.data; memrefC.offset = 0; memrefC.shape[0] = M_size; @@ -62,7 +62,7 @@ int main() { int nerr = 0; for (int i = 0; i < M_size * N_size; i++) { - int32_t error = memrefC.aligned_data[i] - G[i]; + int32_t error = memrefC.aligned_data[i] - C_golden[i]; if (error != 0) nerr += 1; } From c6c5142e2bae3ba74c38f2d9d625f61b5ba65aa4 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Wed, 13 Dec 2023 09:45:31 +0100 Subject: [PATCH 06/16] Add caller from mlir --- kernels/simple_matmul/Makefile | 2 +- kernels/simple_matmul/baseline.c | 31 ------------------------------- kernels/simple_matmul/cpu.mlir | 7 +++++++ 3 files changed, 8 insertions(+), 32 deletions(-) delete mode 100644 kernels/simple_matmul/baseline.c create mode 100644 kernels/simple_matmul/cpu.mlir diff --git a/kernels/simple_matmul/Makefile b/kernels/simple_matmul/Makefile index 5f3adbf1..ac165f01 100644 --- a/kernels/simple_matmul/Makefile +++ b/kernels/simple_matmul/Makefile @@ -6,7 +6,7 @@ include ../../runtime/snax-gemm.rules include ../../runtime/Makefile.rules TESTS = -TESTS += baseline.x +TESTS += cpu.x CFLAGS += -std=gnu11 CFLAGS += -Wall -Wextra diff --git a/kernels/simple_matmul/baseline.c b/kernels/simple_matmul/baseline.c deleted file mode 100644 index 
fb2ee3a2..00000000 --- a/kernels/simple_matmul/baseline.c +++ /dev/null @@ -1,31 +0,0 @@ -// #include "data.h" -#include "memref.h" -#include "snax-gemm-lib.h" - -#include - -#include - -void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *A, TwoDMemrefI8_t *B, - TwoDMemrefI32_t *C) { - uint8_t Batch = 1; - uint8_t M_size = (uint8_t)A->shape[0]; - uint8_t K_size = (uint8_t)A->shape[1]; - uint8_t N_size = (uint8_t)B->shape[1]; - int8_t *start_addr_a = A->aligned_data; - int8_t *start_addr_b = B->aligned_data; - int32_t *start_addr_c = C->aligned_data; - // TODO extract parameters below from memref? - uint32_t strideInnermostA = 256; - uint32_t strideInnermostB = 256; - uint32_t strideInnermostC = 256; - uint32_t ldA = 2048; - uint32_t ldB = 2048; - uint32_t ldC = 1024; - uint32_t strideA = 0; - uint32_t strideB = 0; - uint32_t strideC = 0; - batch_gemm_cpu(Batch, M_size, K_size, N_size, start_addr_a, start_addr_b, - start_addr_c, strideInnermostA, strideInnermostB, - strideInnermostC, ldA, ldB, ldC, strideA, strideB, strideC); -} diff --git a/kernels/simple_matmul/cpu.mlir b/kernels/simple_matmul/cpu.mlir new file mode 100644 index 00000000..8c6b59a6 --- /dev/null +++ b/kernels/simple_matmul/cpu.mlir @@ -0,0 +1,7 @@ +func.func public @simple_matmul(%A: memref<16x16xi8, 1 : i32>, + %B: memref<16x32xi8, 1 : i32>, + %C: memref<16x32xi32, 1 : i32>) -> () { + func.call @simple_matmul_cpu(%A, %B, %C) : (memref<16x16xi8, 1 : i32>, memref<16x32xi8, 1 : i32>, memref<16x32xi32, 1 : i32>) -> () + return +} +func.func private @simple_matmul_cpu(%A : memref<16x16xi8, 1 : i32>, %B : memref<16x32xi8, 1 : i32>, %C : memref<16x32xi32, 1 : i32>) From 360c183259064caf081932c952b8c3dca560dcb8 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Wed, 13 Dec 2023 11:08:03 +0100 Subject: [PATCH 07/16] Add function to setup CPU kernel --- kernels/simple_matmul/main.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index e6e8b225..b4eb7f31 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -1,6 +1,7 @@ #include "data.h" #include "memref.h" #include "snax-gemm-lib.h" +#include "snax-gemm-params.h" #include "snax_rt.h" #include "stdint.h" @@ -11,14 +12,32 @@ void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, TwoDMemrefI32_t *c); -// void _mlir_ciface_snax_hwpe_mult(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, -// TwoDMemrefI32_t *c) { -// -// set_batch_gemm(a->aligned_data, b->aligned_data, c->aligned_data, -// a->shape[0]); -// start_batch_gemm(); -// wait_batch_gemm(); -// } +void _mlir_ciface_simple_matmul_cpu(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, + TwoDMemrefI32_t *c) { + uint8_t Batch = 1; + // meshRow, tileSize and meshCol are defined in snax-gemm-params.h + uint8_t M_param = M_size / meshRow; + uint8_t K_param = K_size / tileSize; + uint8_t N_param = N_size / meshCol; + int8_t *A_ptr = a->aligned_data; + int8_t *B_ptr = b->aligned_data; + int32_t *C_ptr = c->aligned_data; + // Extracted from datagen.py in snitch_cluster repo + uint32_t strideInnermostA = 256; + uint32_t strideInnermostB = 256; + uint32_t strideInnermostC = 256; + uint32_t ldA = 2048; + uint32_t ldB = 2048; + uint32_t ldC = 1024; + uint32_t strideA = 0; + uint32_t strideB = 0; + uint32_t strideC = 0; + // delta_local_a: 64, + // delta_local_b: 8192 + batch_gemm_cpu(Batch, M_param, K_param, N_param, A_ptr, B_ptr, C_ptr, + strideInnermostA, strideInnermostB, strideInnermostC, ldA, ldB, + ldC, 
strideA, strideB, strideC); +} int main() { From ae7481c3e58638eb84b264cd609aeba6104319f3 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 10:12:25 +0100 Subject: [PATCH 08/16] Add a first version of allocation --- kernels/simple_matmul/main.c | 90 +++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 26 deletions(-) diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index b4eb7f31..58bfd86b 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -8,66 +8,104 @@ #include #include +uint8_t Batch = 1; +// meshRow, tileSize and meshCol are defined in snax-gemm-params.h +uint8_t M_param = M_size / meshRow; +uint8_t K_param = K_size / tileSize; +uint8_t N_param = N_size / meshCol; +// Extracted from datagen.py in snitch_cluster repo +uint32_t strideInnermostA = 256; +uint32_t strideInnermostB = 256; +uint32_t strideInnermostC = 256; +uint32_t ldA = 2048; +uint32_t ldB = 2048; +uint32_t ldC = 1024; +uint32_t strideA = 0; +uint32_t strideB = 0; +uint32_t strideC = 0; + // Kernel provided via external definition void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, TwoDMemrefI32_t *c); void _mlir_ciface_simple_matmul_cpu(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, TwoDMemrefI32_t *c) { - uint8_t Batch = 1; - // meshRow, tileSize and meshCol are defined in snax-gemm-params.h - uint8_t M_param = M_size / meshRow; - uint8_t K_param = K_size / tileSize; - uint8_t N_param = N_size / meshCol; int8_t *A_ptr = a->aligned_data; int8_t *B_ptr = b->aligned_data; int32_t *C_ptr = c->aligned_data; - // Extracted from datagen.py in snitch_cluster repo - uint32_t strideInnermostA = 256; - uint32_t strideInnermostB = 256; - uint32_t strideInnermostC = 256; - uint32_t ldA = 2048; - uint32_t ldB = 2048; - uint32_t ldC = 1024; - uint32_t strideA = 0; - uint32_t strideB = 0; - uint32_t strideC = 0; - // delta_local_a: 64, - // delta_local_b: 8192 batch_gemm_cpu(Batch, M_param, K_param, N_param, A_ptr, B_ptr, C_ptr, strideInnermostA, strideInnermostB, strideInnermostC, ldA, ldB, ldC, strideA, strideB, strideC); } int main() { + // Allocate space in TCDM + // We put the data in different banks, but we don't interleave the data for + // now. 
+ // + // | A | x | x | x | --> A in banks 0 - 7 --> (8/32 banks used)* + // (int8 --> 8 elements/bank) + // 1 row --> 64 elements + // | x | B | x | x | --> B in banks 7 - 15 --> (8/32 banks used)* + // (8 elements/bank)*32 banks + // 1 row --> 64 elements + // | C | C | C | C | --> C in banks 0 - 31 --> (32/32 banks used)* + // (2 elements/bank)* 32 bank + // 1 row --> 64 elements + // | x | x | x | x | + // + // 32 banks --> 1 row = 32 banks * 8 bytes --> 256 adresses further + + static int8_t *allocated_a; + static int8_t *allocated_b; + static int32_t *allocated_c; + + // Transfer data from L3 to L1 + // Using DMA only + if (snrt_is_dm_core()) { + // calculation in bytes directly + allocated_a = (int8_t *)snrt_l1alloc(256 * M_size * K_size / 64); + allocated_b = (int8_t *)snrt_l1alloc(256 * K_size * N_size / 64); + allocated_c = (int32_t *)snrt_l1alloc(256 * M_size * K_size / 64); + } - // Create memref objects for data stored in L1 + // Create memref descriptors for data stored in L1 TwoDMemrefI8_t memrefA; memrefA.data = &A; memrefA.aligned_data = memrefA.data; - memrefA.offset = 0; memrefA.shape[0] = M_size; memrefA.shape[1] = K_size; + // These are not considered correctly right now + memrefA.offset = 0; memrefA.stride[0] = sizeof(int8_t); memrefA.stride[1] = sizeof(int8_t); TwoDMemrefI8_t memrefB; memrefB.data = &B; - memrefB.aligned_data = memrefB.data; - memrefA.offset = 0; - memrefA.shape[0] = K_size; - memrefA.shape[1] = N_size; - memrefA.stride[0] = sizeof(int8_t); - memrefA.stride[1] = sizeof(int8_t); + // Data is stored in banks 8 - 15, so increment by 8banks*8bytes = 64 + memrefB.aligned_data = memrefB.data + 64; + memrefB.shape[0] = K_size; + memrefB.shape[1] = N_size; + // These are not considered correctly right now + memrefB.offset = 0; + memrefB.stride[0] = sizeof(int8_t); + memrefB.stride[1] = sizeof(int8_t); TwoDMemrefI32_t memrefC; memrefC.data = &C; memrefC.aligned_data = memrefC.data; - memrefC.offset = 0; memrefC.shape[0] = M_size; memrefC.shape[1] = N_size; + // These are not considered correctly right now + memrefC.offset = 0; memrefC.stride[0] = sizeof(int32_t); memrefC.stride[1] = sizeof(int32_t); + if (snrt_is_dm_core()) { + load_input_data(Batch, M_size / meshRow, K_size / tileSize, + N_size / meshCol, memrefA.aligned_data, + memrefB.aligned_data, A, B, strideInnermostA, + strideInnermostB, ldA, ldB, strideA, strideB); + } (void)snrt_mcycle(); _mlir_ciface_simple_matmul(&memrefA, &memrefB, &memrefC); From 9c0dfb96ddb9e90cee0f063abbe1525aaed30bb4 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 11:54:11 +0100 Subject: [PATCH 09/16] Add working example for all ones --- kernels/simple_matmul/gendata.py | 4 +++- kernels/simple_matmul/main.c | 31 ++++++++++++++++++------------- kernels/simple_mult/main.c | 1 + 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py index f5304f5c..0819f677 100755 --- a/kernels/simple_matmul/gendata.py +++ b/kernels/simple_matmul/gendata.py @@ -44,11 +44,13 @@ def create_data(file_name: str, variables: Dict[str, npt.NDArray]): low_bound = -128 high_bound = 127 A_size = [16, 16] - B_size = [16, 32] + B_size = [16, 16] np.random.seed(0) # G = A*B A = np.random.randint(low_bound, high_bound, size=A_size, dtype=np.dtype("int8")) + A = np.ones(A_size, dtype=np.dtype("int8")) B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) + B = np.ones(B_size, dtype=np.dtype("int8")) C_golden = 
np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) C = np.zeros(C_golden.shape, np.dtype("int32")) variables = {"A": A, "B": B, "C_golden": C_golden, "C": C} diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index 58bfd86b..0b284897 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -17,9 +17,9 @@ uint8_t N_param = N_size / meshCol; uint32_t strideInnermostA = 256; uint32_t strideInnermostB = 256; uint32_t strideInnermostC = 256; -uint32_t ldA = 2048; -uint32_t ldB = 2048; -uint32_t ldC = 1024; +uint32_t ldA = 512; +uint32_t ldB = 512; +uint32_t ldC = 512; uint32_t strideA = 0; uint32_t strideB = 0; uint32_t strideC = 0; @@ -30,10 +30,10 @@ void _mlir_ciface_simple_matmul(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, void _mlir_ciface_simple_matmul_cpu(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, TwoDMemrefI32_t *c) { - int8_t *A_ptr = a->aligned_data; - int8_t *B_ptr = b->aligned_data; - int32_t *C_ptr = c->aligned_data; - batch_gemm_cpu(Batch, M_param, K_param, N_param, A_ptr, B_ptr, C_ptr, + int8_t *a_ptr = a->aligned_data; + int8_t *b_ptr = b->aligned_data; + int32_t *c_ptr = c->aligned_data; + batch_gemm_cpu(Batch, M_param, K_param, N_param, a_ptr, b_ptr, c_ptr, strideInnermostA, strideInnermostB, strideInnermostC, ldA, ldB, ldC, strideA, strideB, strideC); } @@ -66,12 +66,13 @@ int main() { // calculation in bytes directly allocated_a = (int8_t *)snrt_l1alloc(256 * M_size * K_size / 64); allocated_b = (int8_t *)snrt_l1alloc(256 * K_size * N_size / 64); - allocated_c = (int32_t *)snrt_l1alloc(256 * M_size * K_size / 64); + allocated_c = (int32_t *)snrt_l1alloc(256 * M_size * N_size / 64); } + snrt_cluster_hw_barrier(); // Create memref descriptors for data stored in L1 TwoDMemrefI8_t memrefA; - memrefA.data = &A; + memrefA.data = allocated_a; memrefA.aligned_data = memrefA.data; memrefA.shape[0] = M_size; memrefA.shape[1] = K_size; @@ -81,7 +82,7 @@ int main() { memrefA.stride[1] = sizeof(int8_t); TwoDMemrefI8_t memrefB; - memrefB.data = &B; + memrefB.data = allocated_b; // Data is stored in banks 8 - 15, so increment by 8banks*8bytes = 64 memrefB.aligned_data = memrefB.data + 64; memrefB.shape[0] = K_size; @@ -92,7 +93,7 @@ int main() { memrefB.stride[1] = sizeof(int8_t); TwoDMemrefI32_t memrefC; - memrefC.data = &C; + memrefC.data = allocated_c; memrefC.aligned_data = memrefC.data; memrefC.shape[0] = M_size; memrefC.shape[1] = N_size; @@ -106,10 +107,13 @@ int main() { memrefB.aligned_data, A, B, strideInnermostA, strideInnermostB, ldA, ldB, strideA, strideB); } - + snrt_cluster_hw_barrier(); (void)snrt_mcycle(); - _mlir_ciface_simple_matmul(&memrefA, &memrefB, &memrefC); + if (snrt_is_compute_core()) { + _mlir_ciface_simple_matmul(&memrefA, &memrefB, &memrefC); + } (void)snrt_mcycle(); + snrt_cluster_hw_barrier(); // Correctness check - // from this point on only core 0 is required to be alive. 
@@ -119,6 +123,7 @@ int main() { int nerr = 0; for (int i = 0; i < M_size * N_size; i++) { + // printf("%d , golden : %d\n", memrefC.aligned_data[i],C_golden[i]); int32_t error = memrefC.aligned_data[i] - C_golden[i]; if (error != 0) nerr += 1; diff --git a/kernels/simple_mult/main.c b/kernels/simple_mult/main.c index ffbd75de..6fdc7181 100644 --- a/kernels/simple_mult/main.c +++ b/kernels/simple_mult/main.c @@ -55,6 +55,7 @@ int main() { int nerr = 0; for (int i = 0; i < N; i++) { + printf("result: %d golden: %d\n", memrefD.aligned_data[i], G[i]); int32_t error = memrefD.aligned_data[i] - G[i]; if (error != 0) nerr += 1; From d8c31be3f80eef81e6c05bbd8fa851d5dca9985f Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 12:58:44 +0100 Subject: [PATCH 10/16] Add data layout transformation to golden model --- kernels/simple_matmul/gendata.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py index 0819f677..10e6700f 100755 --- a/kernels/simple_matmul/gendata.py +++ b/kernels/simple_matmul/gendata.py @@ -48,10 +48,18 @@ def create_data(file_name: str, variables: Dict[str, npt.NDArray]): np.random.seed(0) # G = A*B A = np.random.randint(low_bound, high_bound, size=A_size, dtype=np.dtype("int8")) - A = np.ones(A_size, dtype=np.dtype("int8")) + # convert from row-major to block row-major + A = np.reshape(A, [2, 8, 2, 8]) + # convert to [2,2,8,8] + A = np.swapaxes(A, 1, 2) B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) - B = np.ones(B_size, dtype=np.dtype("int8")) + # convert from column-major to block column-major + B = np.reshape(B, [2, 8, 2, 8]) + # convert to [2,2,8,8] + B = np.swapaxes(B, 1, 2) C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) + C_golden = np.reshape(C_golden, [2, 8, 2, 8]) + C_golden = np.swapaxes(C_golden, 1, 2) C = np.zeros(C_golden.shape, np.dtype("int32")) variables = {"A": A, "B": B, "C_golden": C_golden, "C": C} assert A.shape[1] == B.shape[0] From 98bfa2bec2df7c8ccd1cffc8b95cf77af8002862 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 13:00:22 +0100 Subject: [PATCH 11/16] Add workflow for simple_matmul --- .github/workflows/build-run-kernel.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build-run-kernel.yml b/.github/workflows/build-run-kernel.yml index bee37fdd..c74be6b3 100644 --- a/.github/workflows/build-run-kernel.yml +++ b/.github/workflows/build-run-kernel.yml @@ -26,3 +26,8 @@ jobs: export PATH=/opt/python3.11/bin:$PATH make allrun working-directory: kernels/simple_copy + - name: Build and run kernel simple_matmul + run: | + export PATH=/opt/python3.11/bin:$PATH + make allrun + working-directory: kernels/simple_matmul From a28c6ddbd25fcb05bca4b082853b3ffeb1ab2bad Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 13:17:44 +0100 Subject: [PATCH 12/16] Fix data layout transformation in golden model output --- kernels/simple_matmul/gendata.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py index 10e6700f..d7a9551d 100755 --- a/kernels/simple_matmul/gendata.py +++ b/kernels/simple_matmul/gendata.py @@ -46,23 +46,34 @@ def create_data(file_name: str, variables: Dict[str, npt.NDArray]): A_size = [16, 16] B_size = [16, 16] np.random.seed(0) - # G = A*B + + # C = A.B A = np.random.randint(low_bound, high_bound, 
size=A_size, dtype=np.dtype("int8")) + # A = np.ones(A_size, dtype=np.dtype("int8")) + B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) + # B = np.ones(B_size, dtype=np.dtype("int8")) + C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) + C = np.zeros(C_golden.shape, np.dtype("int32")) + + assert A.shape[1] == B.shape[0] + sizes = {"N_size": A.shape[0], "K_size": A.shape[1], "M_size": B.shape[1]} + + # Perform layout transformations before writing to memory # convert from row-major to block row-major A = np.reshape(A, [2, 8, 2, 8]) # convert to [2,2,8,8] A = np.swapaxes(A, 1, 2) - B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) + B = np.transpose(B) # convert from column-major to block column-major B = np.reshape(B, [2, 8, 2, 8]) # convert to [2,2,8,8] B = np.swapaxes(B, 1, 2) - C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) + # convert from row-major to block row-major C_golden = np.reshape(C_golden, [2, 8, 2, 8]) + # convert to [2,2,8,8] C_golden = np.swapaxes(C_golden, 1, 2) - C = np.zeros(C_golden.shape, np.dtype("int32")) + variables = {"A": A, "B": B, "C_golden": C_golden, "C": C} - assert A.shape[1] == B.shape[0] - sizes = {"N_size": A.shape[0], "K_size": A.shape[1], "M_size": B.shape[1]} + create_header("data.h", sizes, variables) create_data("data.c", variables) From 658f204b6161f35f26b2d66039a381dd31dcaf65 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Thu, 14 Dec 2023 13:21:37 +0100 Subject: [PATCH 13/16] Refactor gendata.py --- kernels/simple_matmul/gendata.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py index d7a9551d..41ebbe06 100755 --- a/kernels/simple_matmul/gendata.py +++ b/kernels/simple_matmul/gendata.py @@ -59,21 +59,29 @@ def create_data(file_name: str, variables: Dict[str, npt.NDArray]): sizes = {"N_size": A.shape[0], "K_size": A.shape[1], "M_size": B.shape[1]} # Perform layout transformations before writing to memory + # convert from row-major to block row-major - A = np.reshape(A, [2, 8, 2, 8]) + A_new_layout = np.reshape(A, [2, 8, 2, 8]) # convert to [2,2,8,8] - A = np.swapaxes(A, 1, 2) - B = np.transpose(B) + A_new_layout = np.swapaxes(A_new_layout, 1, 2) + + B_new_layout = np.transpose(B) # convert from column-major to block column-major - B = np.reshape(B, [2, 8, 2, 8]) + B_new_layout = np.reshape(B_new_layout, [2, 8, 2, 8]) # convert to [2,2,8,8] - B = np.swapaxes(B, 1, 2) + B_new_layout = np.swapaxes(B_new_layout, 1, 2) # convert from row-major to block row-major - C_golden = np.reshape(C_golden, [2, 8, 2, 8]) + C_golden_new_layout = np.reshape(C_golden, [2, 8, 2, 8]) # convert to [2,2,8,8] - C_golden = np.swapaxes(C_golden, 1, 2) + C_golden_new_layout = np.swapaxes(C_golden_new_layout, 1, 2) - variables = {"A": A, "B": B, "C_golden": C_golden, "C": C} + # C are just all zeros, so layout not important + variables = { + "A": A_new_layout, + "B": B_new_layout, + "C_golden": C_golden_new_layout, + "C": C, + } create_header("data.h", sizes, variables) create_data("data.c", variables) From 7b8be56d36994f46951ba1b3225712b47db307f8 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Mon, 18 Dec 2023 16:02:46 +0100 Subject: [PATCH 14/16] Adress Joren's comments --- kernels/simple_matmul/gendata.py | 5 ++-- kernels/simple_matmul/main.c | 48 ++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 15 deletions(-) 
diff --git a/kernels/simple_matmul/gendata.py b/kernels/simple_matmul/gendata.py index 41ebbe06..ce2e8f1a 100755 --- a/kernels/simple_matmul/gendata.py +++ b/kernels/simple_matmul/gendata.py @@ -49,13 +49,12 @@ def create_data(file_name: str, variables: Dict[str, npt.NDArray]): # C = A.B A = np.random.randint(low_bound, high_bound, size=A_size, dtype=np.dtype("int8")) - # A = np.ones(A_size, dtype=np.dtype("int8")) B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) - # B = np.ones(B_size, dtype=np.dtype("int8")) + # Make sure the product is possible! + assert A.shape[1] == B.shape[0] C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) C = np.zeros(C_golden.shape, np.dtype("int32")) - assert A.shape[1] == B.shape[0] sizes = {"N_size": A.shape[0], "K_size": A.shape[1], "M_size": B.shape[1]} # Perform layout transformations before writing to memory diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index 0b284897..92fae803 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -1,12 +1,28 @@ +#include "stdint.h" + #include "data.h" #include "memref.h" -#include "snax-gemm-lib.h" -#include "snax-gemm-params.h" #include "snax_rt.h" -#include "stdint.h" +/* + * These libraries are included from github.com/KULeuven-MICAS/snitch_cluster + * Interested users, might want to look at: + * + * /sw/snRuntime/api + * /target/snitch_cluster/sw/runtime/rtl/src + * /target/snitch_cluster/sw/runtime/common + * */ #include -#include + +/* These libraries are included from github.com/KULeuven-MICAS/snitch_cluster + * Interested users, might want to look at: + * + * /target/snitch_cluster/sw/snax/gemm/include" + * /target/snitch_cluster/sw/snax/mac/include" + * + * */ +#include "snax-gemm-lib.h" +#include "snax-gemm-params.h" uint8_t Batch = 1; // meshRow, tileSize and meshCol are defined in snax-gemm-params.h @@ -76,10 +92,13 @@ int main() { memrefA.aligned_data = memrefA.data; memrefA.shape[0] = M_size; memrefA.shape[1] = K_size; - // These are not considered correctly right now + // The following values of this memref are ignored right now. + // A 2D memref is not enough to express a tiled-block layout (=4D), + // necessary by the accelerator, + // Instead we use the variables strideInnermostA, ldA and strideA memrefA.offset = 0; - memrefA.stride[0] = sizeof(int8_t); - memrefA.stride[1] = sizeof(int8_t); + memrefA.stride[0] = 0; + memrefA.stride[1] = 0; TwoDMemrefI8_t memrefB; memrefB.data = allocated_b; @@ -87,17 +106,23 @@ int main() { memrefB.aligned_data = memrefB.data + 64; memrefB.shape[0] = K_size; memrefB.shape[1] = N_size; - // These are not considered correctly right now + // The following values of this memref are ignored right now. + // A 2D memref is not enough to express a tiled-block layout (=4D), + // necessary by the accelerator, + // Instead we use the variables strideInnermostB, ldB and strideB. memrefB.offset = 0; - memrefB.stride[0] = sizeof(int8_t); - memrefB.stride[1] = sizeof(int8_t); + memrefB.stride[0] = 0; + memrefB.stride[1] = 0; TwoDMemrefI32_t memrefC; memrefC.data = allocated_c; memrefC.aligned_data = memrefC.data; memrefC.shape[0] = M_size; memrefC.shape[1] = N_size; - // These are not considered correctly right now + // The following values of this memref are ignored right now. 
+ // A 2D memref is not enough to express a tiled-block layout (=4D), + // necessary by the accelerator, + // Instead we use the variables strideInnermostC, ldC and strideC memrefC.offset = 0; memrefC.stride[0] = sizeof(int32_t); memrefC.stride[1] = sizeof(int32_t); @@ -123,7 +148,6 @@ int main() { int nerr = 0; for (int i = 0; i < M_size * N_size; i++) { - // printf("%d , golden : %d\n", memrefC.aligned_data[i],C_golden[i]); int32_t error = memrefC.aligned_data[i] - C_golden[i]; if (error != 0) nerr += 1; From 4a50c069f0a95e85ecba7f2c57a6430060092d28 Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Mon, 18 Dec 2023 16:04:51 +0100 Subject: [PATCH 15/16] Also improve comments on C memref --- kernels/simple_matmul/main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernels/simple_matmul/main.c b/kernels/simple_matmul/main.c index 92fae803..1e37e3c9 100644 --- a/kernels/simple_matmul/main.c +++ b/kernels/simple_matmul/main.c @@ -124,8 +124,9 @@ int main() { // necessary by the accelerator, // Instead we use the variables strideInnermostC, ldC and strideC memrefC.offset = 0; - memrefC.stride[0] = sizeof(int32_t); - memrefC.stride[1] = sizeof(int32_t); + memrefC.stride[0] = 0; + memrefC.stride[1] = 0; + if (snrt_is_dm_core()) { load_input_data(Batch, M_size / meshRow, K_size / tileSize, N_size / meshCol, memrefA.aligned_data, From 6334e8968df8bc1f1f90d2fa996a1cf7db5f394a Mon Sep 17 00:00:00 2001 From: Josse Van Delm Date: Mon, 18 Dec 2023 16:13:29 +0100 Subject: [PATCH 16/16] Remove print statements from simple_mult --- kernels/simple_mult/main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernels/simple_mult/main.c b/kernels/simple_mult/main.c index 6fdc7181..ffbd75de 100644 --- a/kernels/simple_mult/main.c +++ b/kernels/simple_mult/main.c @@ -55,7 +55,6 @@ int main() { int nerr = 0; for (int i = 0; i < N; i++) { - printf("result: %d golden: %d\n", memrefD.aligned_data[i], G[i]); int32_t error = memrefD.aligned_data[i] - G[i]; if (error != 0) nerr += 1;