From 0e1071ae6669644194b0042dcfcfa4f06257369a Mon Sep 17 00:00:00 2001 From: Xiaoling Yi <143962462+xiaoling-yi@users.noreply.github.com> Date: Mon, 23 Sep 2024 14:20:30 +0200 Subject: [PATCH] Delete old test for gemmx and add new gemmx test (#43) * delete old runtimes and sw * add gemmx runtime lib and sw test * add ci --- .../snax/snax-data-reshuffler/data/datagen.py | 511 ---------- .../snax-data-reshuffler/data/params.hjson | 44 - .../src/snax-data-reshuffler.c | 65 -- .../Makefile | 8 +- .../data/Makefile | 0 .../apps/snax/snax-gemmx/data/datagen.py | 903 ++++++++++++++++++ .../apps/snax/snax-gemmx/data/params.hjson | 12 + .../apps/snax/snax-gemmx/src/snax-gemmx.c | 128 +++ .../snax-streamer-gemm-conv-simd/Makefile | 27 - .../data/Makefile | 23 - .../data/datagen.py | 464 --------- .../data/params.hjson | 20 - .../src/snax-streamer-gemm-conv-simd.c | 114 --- .../snax/snax-streamer-gemm-conv/Makefile | 25 - .../snax-streamer-gemm-conv/data/Makefile | 23 - .../snax-streamer-gemm-conv/data/datagen.py | 514 ---------- .../snax-streamer-gemm-conv/data/params.hjson | 19 - .../src/snax-streamer-gemm-conv.c | 89 -- .../sw/device/snax/data-reshuffler/Makefile | 32 - .../include/snax-data-reshuffler-lib.h | 57 -- .../src/snax-data-reshuffler-lib.c | 155 --- .../snax/{streamer-simd => gemmx}/Makefile | 5 +- .../include/snax-gemmx-lib.h} | 25 +- .../snax/gemmx/include/snax-gemmx-params.h | 11 + .../gemmx/include/streamer_csr_addr_map.h | 73 ++ .../sw/device/snax/gemmx/src/snax-gemmx-lib.c | 262 +++++ .../snax/streamer-gemm-conv-simd/Makefile | 32 - .../src/snax-streamer-gemm-conv-simd-lib.c | 224 ----- .../device/snax/streamer-gemm-conv/Makefile | 21 - .../include/snax-streamer-gemm-conv-lib.h | 52 - .../src/snax-streamer-gemm-conv-lib.c | 139 --- .../sim/sw/device/snax/streamer-gemm/Makefile | 32 - .../include/snax-streamer-gemm-lib.h | 36 - .../src/snax-streamer-gemm-lib.c | 101 -- .../include/snax-streamer-simd-lib.h | 66 -- .../src/snax-streamer-simd-lib.c | 165 ---- 
target/sim/sw/host/apps/offload/Makefile | 4 +- target/sim/sw/sim_elf.yaml | 1 + util/sim/snax_utils.py | 427 --------- 39 files changed, 1415 insertions(+), 3494 deletions(-) delete mode 100755 target/sim/sw/device/apps/snax/snax-data-reshuffler/data/datagen.py delete mode 100644 target/sim/sw/device/apps/snax/snax-data-reshuffler/data/params.hjson delete mode 100644 target/sim/sw/device/apps/snax/snax-data-reshuffler/src/snax-data-reshuffler.c rename target/sim/sw/device/apps/snax/{snax-data-reshuffler => snax-gemmx}/Makefile (61%) rename target/sim/sw/device/apps/snax/{snax-data-reshuffler => snax-gemmx}/data/Makefile (100%) create mode 100755 target/sim/sw/device/apps/snax/snax-gemmx/data/datagen.py create mode 100644 target/sim/sw/device/apps/snax/snax-gemmx/data/params.hjson create mode 100644 target/sim/sw/device/apps/snax/snax-gemmx/src/snax-gemmx.c delete mode 100644 target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/Makefile delete mode 100644 target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/Makefile delete mode 100755 target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/datagen.py delete mode 100644 target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/params.hjson delete mode 100644 target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/src/snax-streamer-gemm-conv-simd.c delete mode 100755 target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/Makefile delete mode 100755 target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/Makefile delete mode 100755 target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/datagen.py delete mode 100755 target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/params.hjson delete mode 100755 target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/src/snax-streamer-gemm-conv.c delete mode 100644 target/sim/sw/device/snax/data-reshuffler/Makefile delete mode 100644 target/sim/sw/device/snax/data-reshuffler/include/snax-data-reshuffler-lib.h delete mode 100644 
target/sim/sw/device/snax/data-reshuffler/src/snax-data-reshuffler-lib.c rename target/sim/sw/device/snax/{streamer-simd => gemmx}/Makefile (91%) rename target/sim/sw/device/snax/{streamer-gemm-conv-simd/include/snax-streamer-gemm-conv-simd-lib.h => gemmx/include/snax-gemmx-lib.h} (76%) create mode 100644 target/sim/sw/device/snax/gemmx/include/snax-gemmx-params.h create mode 100644 target/sim/sw/device/snax/gemmx/include/streamer_csr_addr_map.h create mode 100644 target/sim/sw/device/snax/gemmx/src/snax-gemmx-lib.c delete mode 100644 target/sim/sw/device/snax/streamer-gemm-conv-simd/Makefile delete mode 100644 target/sim/sw/device/snax/streamer-gemm-conv-simd/src/snax-streamer-gemm-conv-simd-lib.c delete mode 100755 target/sim/sw/device/snax/streamer-gemm-conv/Makefile delete mode 100755 target/sim/sw/device/snax/streamer-gemm-conv/include/snax-streamer-gemm-conv-lib.h delete mode 100755 target/sim/sw/device/snax/streamer-gemm-conv/src/snax-streamer-gemm-conv-lib.c delete mode 100644 target/sim/sw/device/snax/streamer-gemm/Makefile delete mode 100644 target/sim/sw/device/snax/streamer-gemm/include/snax-streamer-gemm-lib.h delete mode 100644 target/sim/sw/device/snax/streamer-gemm/src/snax-streamer-gemm-lib.c delete mode 100644 target/sim/sw/device/snax/streamer-simd/include/snax-streamer-simd-lib.h delete mode 100644 target/sim/sw/device/snax/streamer-simd/src/snax-streamer-simd-lib.c delete mode 100644 util/sim/snax_utils.py diff --git a/target/sim/sw/device/apps/snax/snax-data-reshuffler/data/datagen.py b/target/sim/sw/device/apps/snax/snax-data-reshuffler/data/datagen.py deleted file mode 100755 index 1ef55fc23..000000000 --- a/target/sim/sw/device/apps/snax/snax-data-reshuffler/data/datagen.py +++ /dev/null @@ -1,511 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2023 KU Leuven. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Xiaoling Yi - -import numpy as np -import argparse -import pathlib -import hjson -import sys -import os - -# Add data utility path -sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../../../../util/sim/")) -from data_utils import format_scalar_definition, format_vector_definition # noqa E402 - -# Add golden model path -from snax_utils import data_reshuffler_golden_model, max_pooling # noqa E402 - -np.random.seed(42) - - -# Add stdint.h header -def emit_header_file(**kwargs): - emit_str = "#include \n\n" - emit_str += "#include \n\n" - emit_str += emit_gemm_data(**kwargs) - return emit_str - - -MIN = -128 -MAX = 127 - - -def emit_gemm_data(**kwargs): - data_str = [] - - if kwargs["ifMaxPool"] is False: - # Generating loop bounds settings - data_str += [ - format_scalar_definition("int8_t", "tempLoop0_in", kwargs["tempLoop0"]), - format_scalar_definition("int8_t", "tempLoop1_in", kwargs["tempLoop1"]), - format_scalar_definition("int8_t", "tempLoop2_in", 1), - format_scalar_definition("int8_t", "tempLoop3_in", 1), - format_scalar_definition("int8_t", "tempLoop4_in", 1), - format_scalar_definition("int8_t", "tempLoop0_out", kwargs["tempLoop0"]), - format_scalar_definition("int8_t", "tempLoop1_out", kwargs["tempLoop1"]), - format_scalar_definition("int8_t", "tempLoop2_out", 1), - format_scalar_definition( - "int32_t", - "input_data_len", - kwargs["tempLoop0"] * kwargs["tempLoop1"] * 8 * 8, - ), - format_scalar_definition( - "int32_t", - "output_data_len", - kwargs["tempLoop0"] * kwargs["tempLoop1"] * 8 * 8, - ), - ] - - # Generating temporal strides settings - # DMA strides (from L3 to L1) - data_str += [ - format_scalar_definition( - "int32_t", "DMAtempStride0_in", kwargs["DMAtempStride0_in"] - ), - format_scalar_definition( - "int32_t", "DMAtempStride1_in", kwargs["DMAtempStride1_in"] - ), - format_scalar_definition( - "int32_t", "DMAspatialStride1_in", kwargs["DMAspatialStride1_in"] - ), - # data 
reshuffler input strides - format_scalar_definition( - "int32_t", "tempStride0_in", kwargs["tempStride0_in"] - ), - format_scalar_definition( - "int32_t", "tempStride1_in", kwargs["tempStride1_in"] - ), - format_scalar_definition("int32_t", "tempStride2_in", 0), - format_scalar_definition("int32_t", "tempStride3_in", 0), - format_scalar_definition("int32_t", "tempStride4_in", 0), - format_scalar_definition( - "int32_t", "spatialStride1_in", kwargs["spatialStride1_in"] - ), - # data reshuffler output strides - format_scalar_definition( - "int32_t", - "tempStride0_out", - kwargs["tempStride0_out"], - ), - format_scalar_definition( - "int32_t", "tempStride1_out", kwargs["tempStride1_out"] - ), - format_scalar_definition("int32_t", "tempStride2_out", 0), - format_scalar_definition( - "int32_t", "spatialStride1_out", kwargs["spatialStride1_out"] - ), - # Generating base address pointers - format_scalar_definition( - "int32_t", "delta_local_in", kwargs["delta_local_in"] - ), - format_scalar_definition( - "int32_t", "delta_local_out", kwargs["delta_local_out"] - ), - ] - - # Generating random input data vector - length_in = ( - kwargs["tempLoop0"] - * kwargs["tempLoop1"] - * kwargs["spatial_len_0"] - * kwargs["spatial_len_1"] - ) - - data_in = np.random.randint(MIN, MAX, length_in) - - op = kwargs["op"] - - # Generating golden data - # NOTE: using 4 loops to iterate through the - # input data and reshuffle the data. - # different from the hardware data reshuffler, - # the golden model uses the pure strided layout mapping equation, - # no 64 data granularity constraint, no need to transpose explicitly. 
- if op == "rowmajor2tiledrowmajor": - c_golden = data_reshuffler_golden_model( - kwargs["tempLoop0"], - kwargs["tempLoop1"], - kwargs["spatial_len_0"], - kwargs["spatial_len_1"], - kwargs["tempStride0_in"], - kwargs["tempStride1_in"], - 1, - kwargs["spatialStride1_in"], - data_in, - ) - - if op == "rowmajor2tiledcolmajor": - c_golden = data_reshuffler_golden_model( - kwargs["tempLoop0"], - kwargs["tempLoop1"], - kwargs["spatial_len_0"], - kwargs["spatial_len_1"], - kwargs["tempStride0_in"], - kwargs["tempStride1_in"], - kwargs["tempLoop0"] * 8, - 1, - data_in, - ) - - if op == "tiledrowmajor2tiledcolmajor": - c_golden = data_reshuffler_golden_model( - kwargs["tempLoop0"], - kwargs["tempLoop1"], - kwargs["spatial_len_0"], - kwargs["spatial_len_1"], - kwargs["tempStride0_in"], - kwargs["tempStride1_in"], - 8, - 1, - data_in, - ) - - # Generating transpose flag for the data reshuffler hardware - if op == "rowmajor2tiledrowmajor": - transpose = 0 - elif op == "rowmajor2tiledcolmajor": - transpose = 1 - elif op == "tiledrowmajor2tiledcolmajor": - transpose = 1 - else: - print("Invalid operation") - - # set transpose or not - data_str += [ - format_scalar_definition( - "int", "TloopLen", kwargs["tempLoop0"] * kwargs["tempLoop1"] - ) - ] - data_str += [format_scalar_definition("int", "reduceLen", 1)] - data_str += [format_scalar_definition("int", "opcode", transpose)] - - # Writing testing data and golden data into data.h - data_str += [format_vector_definition("int8_t", "DataIn", data_in)] - data_str += [format_vector_definition("int8_t", "C_golden", c_golden)] - - data_str = "\n\n".join(data_str) - - elif kwargs["ifC8HW8datalayout"] is True: - # data layout, C8HW8 - # Generating loop bounds settings - padded_input_tensor_w = kwargs["W"] + kwargs["pad_w"] * 2 - padded_input_tensor_h = kwargs["H"] + kwargs["pad_h"] * 2 - - padded_output_tensor_w = ( - kwargs["W"] + kwargs["pad_w"] * 2 - kwargs["Kw"] - ) // kwargs["stride_w"] + 1 - padded_output_tensor_h = ( - kwargs["H"] 
+ kwargs["pad_h"] * 2 - kwargs["Kh"] - ) // kwargs["stride_h"] + 1 - - input_data_len = padded_input_tensor_w * padded_input_tensor_h * kwargs["Cin"] - output_data_len = ( - padded_output_tensor_w * padded_output_tensor_h * kwargs["Cin"] - ) - - assert padded_output_tensor_w == kwargs["W"] - assert padded_output_tensor_h == kwargs["H"] - - data_str += [ - # input data reshuffler loop bounds settings - format_scalar_definition("int8_t", "tempLoop0_in", kwargs["Kw"]), - format_scalar_definition("int8_t", "tempLoop1_in", kwargs["Kh"]), - format_scalar_definition( - "int8_t", "tempLoop2_in", padded_output_tensor_w // 8 - ), - format_scalar_definition("int8_t", "tempLoop3_in", padded_output_tensor_h), - format_scalar_definition("int8_t", "tempLoop4_in", kwargs["Cin"] // 8), - # output data reshuffler loop bounds settings - format_scalar_definition( - "int8_t", "tempLoop0_out", padded_output_tensor_w // 8 - ), - format_scalar_definition("int8_t", "tempLoop1_out", padded_output_tensor_h), - format_scalar_definition("int8_t", "tempLoop2_out", kwargs["Cin"] // 8), - # data length setting - format_scalar_definition("int32_t", "input_data_len", input_data_len), - format_scalar_definition("int32_t", "output_data_len", output_data_len), - format_scalar_definition( - "int32_t", - "TloopLen", - padded_output_tensor_w - * padded_output_tensor_h - * kwargs["Cin"] - // 8 - // 8, - ), - format_scalar_definition( - "int32_t", "reduceLen", kwargs["Kw"] * kwargs["Kh"] - ), - ] - - assert padded_output_tensor_w * 8 == 8 * 8 * (padded_output_tensor_w // 8) - data_str += [ - # data reshuffler input strides - format_scalar_definition("int32_t", "spatialStride1_in", 8), - format_scalar_definition( - "int32_t", "tempStride0_in", kwargs["stride_w"] * 8 - ), - format_scalar_definition( - "int32_t", "tempStride1_in", padded_input_tensor_w * 8 - ), - format_scalar_definition("int32_t", "tempStride2_in", 8 * 8), - format_scalar_definition( - "int32_t", - "tempStride3_in", - padded_input_tensor_w * 
8 * kwargs["stride_h"], - ), - format_scalar_definition( - "int32_t", - "tempStride4_in", - padded_input_tensor_w * padded_input_tensor_h * 8, - ), - # data reshuffler output strides - format_scalar_definition("int32_t", "spatialStride1_out", 8), - format_scalar_definition( - "int32_t", - "tempStride0_out", - 8 * 8, - ), - format_scalar_definition( - "int32_t", "tempStride1_out", padded_output_tensor_w * 8 - ), - format_scalar_definition( - "int32_t", - "tempStride2_out", - padded_output_tensor_w * padded_output_tensor_h * 8, - ), - # Generating base address pointers - format_scalar_definition("int32_t", "delta_local_in", 0), - format_scalar_definition( - "int32_t", - "delta_local_out", - padded_input_tensor_h * padded_input_tensor_w * kwargs["Cin"], - ), - ] - - # Generating random input data vector - data_in = np.random.randint( - MIN, MAX, (kwargs["Cin"] // 8, kwargs["H"], kwargs["W"], 8) - ) - - # Generating golden data - c_golden = max_pooling( - data_in, - kwargs["Kw"], - kwargs["Kh"], - kwargs["stride_w"], - kwargs["stride_h"], - kwargs["pad_w"], - kwargs["pad_h"], - "C8HW8", - ) - - padded_data_in = np.pad( - data_in, - ( - (0, 0), - (kwargs["pad_h"], kwargs["pad_h"]), - (kwargs["pad_w"], kwargs["pad_w"]), - (0, 0), - ), - "constant", - ) - - # set opcode - data_str += [format_scalar_definition("int", "opcode", 2)] - - # Writing testing data and golden data into data.h - assert padded_data_in.shape == ( - kwargs["Cin"] // 8, - padded_input_tensor_h, - padded_input_tensor_w, - 8, - ) - assert padded_data_in.reshape(-1).shape[0] == input_data_len - data_str += [ - format_vector_definition("int8_t", "DataIn", padded_data_in.reshape(-1)) - ] - - assert c_golden.shape == ( - kwargs["Cin"] // 8, - padded_output_tensor_h, - padded_output_tensor_w, - 8, - ) - assert c_golden.reshape(-1).shape[0] == output_data_len - - data_str += [ - format_vector_definition("int8_t", "C_golden", c_golden.reshape(-1)) - ] - - data_str = "\n\n".join(data_str) - else: - # data layout 
HWCin - # Generating loop bounds settings - padded_input_tensor_w = kwargs["W"] + kwargs["pad_w"] * 2 - padded_input_tensor_h = kwargs["H"] + kwargs["pad_h"] * 2 - - padded_output_tensor_w = ( - kwargs["W"] + kwargs["pad_w"] * 2 - kwargs["Kw"] - ) // kwargs["stride_w"] + 1 - padded_output_tensor_h = ( - kwargs["H"] + kwargs["pad_h"] * 2 - kwargs["Kh"] - ) // kwargs["stride_h"] + 1 - - input_data_len = padded_input_tensor_w * padded_input_tensor_h * kwargs["Cin"] - output_data_len = ( - padded_output_tensor_w * padded_output_tensor_h * kwargs["Cin"] - ) - - assert padded_output_tensor_w == kwargs["W"] - assert padded_output_tensor_h == kwargs["H"] - - data_str += [ - # input data reshuffler loop bounds settings - format_scalar_definition("int8_t", "tempLoop0_in", kwargs["Kw"]), - format_scalar_definition("int8_t", "tempLoop1_in", kwargs["Kh"]), - format_scalar_definition("int8_t", "tempLoop2_in", kwargs["Cin"] // 8), - format_scalar_definition( - "int8_t", "tempLoop3_in", padded_output_tensor_w // 8 - ), - format_scalar_definition("int8_t", "tempLoop4_in", padded_output_tensor_h), - # output data reshuffler loop bounds settings - format_scalar_definition("int8_t", "tempLoop0_out", kwargs["Cin"] // 8), - format_scalar_definition( - "int8_t", "tempLoop1_out", padded_output_tensor_w // 8 - ), - format_scalar_definition("int8_t", "tempLoop2_out", padded_output_tensor_h), - # data length setting - format_scalar_definition("int32_t", "input_data_len", input_data_len), - format_scalar_definition("int32_t", "output_data_len", output_data_len), - format_scalar_definition( - "int32_t", - "TloopLen", - padded_output_tensor_w - * padded_output_tensor_h - * kwargs["Cin"] - // 8 - // 8, - ), - format_scalar_definition( - "int32_t", "reduceLen", kwargs["Kw"] * kwargs["Kh"] - ), - ] - - data_str += [ - # data reshuffler input strides - format_scalar_definition("int32_t", "spatialStride1_in", kwargs["Cin"]), - format_scalar_definition( - "int32_t", "tempStride0_in", 
kwargs["stride_w"] * kwargs["Cin"] - ), - format_scalar_definition( - "int32_t", "tempStride1_in", padded_input_tensor_w * kwargs["Cin"] - ), - format_scalar_definition("int32_t", "tempStride2_in", 8), - format_scalar_definition("int32_t", "tempStride3_in", 8 * kwargs["Cin"]), - format_scalar_definition( - "int32_t", "tempStride4_in", padded_input_tensor_w * kwargs["Cin"] - ), - # data reshuffler output strides - format_scalar_definition("int32_t", "spatialStride1_out", kwargs["Cin"]), - format_scalar_definition("int32_t", "tempStride0_out", 8), - format_scalar_definition("int32_t", "tempStride1_out", 8 * kwargs["Cin"]), - format_scalar_definition( - "int32_t", "tempStride2_out", padded_output_tensor_w * kwargs["Cin"] - ), - # Generating base address pointers - format_scalar_definition("int32_t", "delta_local_in", 0), - format_scalar_definition( - "int32_t", - "delta_local_out", - padded_input_tensor_h * padded_input_tensor_w * kwargs["Cin"], - ), - ] - - # Generating random input data vector - data_in = np.random.randint( - MIN, MAX, (1, kwargs["H"], kwargs["W"], kwargs["Cin"]) - ) - - # Generating golden data - c_golden = max_pooling( - data_in, - kwargs["Kw"], - kwargs["Kh"], - kwargs["stride_w"], - kwargs["stride_h"], - kwargs["pad_w"], - kwargs["pad_h"], - "HWC", - ) - - padded_data_in = np.pad( - data_in, - ( - (0, 0), - (kwargs["pad_h"], kwargs["pad_h"]), - (kwargs["pad_w"], kwargs["pad_w"]), - (0, 0), - ), - "constant", - ) - - # set opcode - data_str += [format_scalar_definition("int", "opcode", 2)] - - # Writing testing data and golden data into data.h - assert padded_data_in.shape == ( - 1, - padded_input_tensor_h, - padded_input_tensor_w, - kwargs["Cin"], - ) - assert padded_data_in.reshape(-1).shape[0] == input_data_len - data_str += [ - format_vector_definition("int8_t", "DataIn", padded_data_in.reshape(-1)) - ] - - assert c_golden.shape == ( - 1, - padded_output_tensor_h, - padded_output_tensor_w, - kwargs["Cin"], - ) - assert 
c_golden.reshape(-1).shape[0] == output_data_len - - data_str += [ - format_vector_definition("int8_t", "C_golden", c_golden.reshape(-1)) - ] - - data_str = "\n\n".join(data_str) - - return data_str - - -def main(): - # Parsing cmd args - parser = argparse.ArgumentParser(description="Generating data for kernels") - parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", - ) - args = parser.parse_args() - - # Load param config file - with args.cfg.open() as f: - param = hjson.loads(f.read()) - - # Emit header file - print(emit_header_file(**param)) - - -if __name__ == "__main__": - main() diff --git a/target/sim/sw/device/apps/snax/snax-data-reshuffler/data/params.hjson b/target/sim/sw/device/apps/snax/snax-data-reshuffler/data/params.hjson deleted file mode 100644 index a7225abc6..000000000 --- a/target/sim/sw/device/apps/snax/snax-data-reshuffler/data/params.hjson +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -{ - // ifMaxPool: false, - // ifMaxPool: true, - ifMaxPool: false, - - // parameters for maxpool - // ifC8HW8datalayout: true, - ifC8HW8datalayout: false, - Nbatch: 1, - H: 32, - W: 32, - Cin: 8, - - Kh: 3, - Kw: 3, - pad_h: 1, - pad_w: 1, - stride_h: 1, - stride_w: 1, - - // parameters for data layout reshuffling - op: 'rowmajor2tiledrowmajor', - tempLoop0: 8, - tempLoop1: 8, - DMAspatialStride1_in: 8, - DMAtempStride0_in: 64, - DMAtempStride1_in: 512, - spatialStride1_in: 64, - tempStride0_in: 8, - tempStride1_in: 512, - spatialStride1_out: 8, - tempStride0_out: 64, - tempStride1_out: 512, - delta_local_in: 0, - delta_local_out: 4096, - spatial_len_0: 8, - spatial_len_1: 8 -} diff --git a/target/sim/sw/device/apps/snax/snax-data-reshuffler/src/snax-data-reshuffler.c b/target/sim/sw/device/apps/snax/snax-data-reshuffler/src/snax-data-reshuffler.c deleted file mode 100644 index 3b6ec4cbb..000000000 --- a/target/sim/sw/device/apps/snax/snax-data-reshuffler/src/snax-data-reshuffler.c +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2024 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include "data.h" -#include "printf.h" - -#include "snax-data-reshuffler-lib.h" - -int main() { - uint32_t CORE_IDX = snrt_cluster_core_idx(); - - // Set err value for checking - int err = 0; - - // Prepare addresses in TCDM - int8_t* local_in; - int8_t* local_out; - - // Allocate space in TCDM - local_in = (int8_t*)(snrt_l1_next() + delta_local_in); - local_out = (int8_t*)(snrt_l1_next() + delta_local_out); - - // uint32_t dma_pre_load = snrt_mcycle(); - - // Transfer data from L3 to L1 - // Using DMA only - if (snrt_is_dm_core()) { - load_a_chrunk_of_data(local_in, DataIn, input_data_len); - } - - // Wait for DMA to finish - snrt_cluster_hw_barrier(); - - uint32_t data_reshuffler_cycle; - - if (CORE_IDX == 0) { - // Set data-reshuffler configuration CSR - set_data_reshuffler_csr( - tempLoop0_in, tempLoop1_in, tempLoop2_in, tempLoop3_in, - tempLoop4_in, tempStride0_in, tempStride1_in, tempStride2_in, - tempStride3_in, tempStride4_in, spatialStride1_in, tempLoop0_out, - tempLoop1_out, tempLoop2_out, tempStride0_out, tempStride1_out, - tempStride2_out, spatialStride1_out, (int32_t)delta_local_in, - (int32_t)delta_local_out); - - start_streamer(); - - // Set CSR to start data-reshuffler - set_data_reshuffler(TloopLen, reduceLen, opcode); - start_data_reshuffler(); - - // Wait for data-reshuffler to finish - wait_data_reshuffler(); - wait_streamer(); - - // Compare SNAX data-reshuffler result with golden python model - err += test_a_chrunk_of_data(local_out, C_golden, output_data_len); - printf("Data reshuffler finished. 
Error: %d \n", err); - }; - - return err; -} diff --git a/target/sim/sw/device/apps/snax/snax-data-reshuffler/Makefile b/target/sim/sw/device/apps/snax/snax-gemmx/Makefile similarity index 61% rename from target/sim/sw/device/apps/snax/snax-data-reshuffler/Makefile rename to target/sim/sw/device/apps/snax/snax-gemmx/Makefile index 62b0e3c62..cf7cd70a1 100644 --- a/target/sim/sw/device/apps/snax/snax-data-reshuffler/Makefile +++ b/target/sim/sw/device/apps/snax/snax-gemmx/Makefile @@ -4,16 +4,16 @@ # # Xiaoling Yi -APP = snax-data-reshuffler +APP = snax-gemmx INCDIRS = data -INCDIRS += ../../../snax/data-reshuffler/include +INCDIRS += ../../../snax/gemmx/include # Include this binary in the final build -RISCV_LDFLAGS += ../../../snax/data-reshuffler/build/snax-data-reshuffler-lib.o +RISCV_LDFLAGS += ../../../snax/gemmx/build/snax-gemmx-lib.o -SRCS = src/snax-data-reshuffler.c +SRCS = src/snax-gemmx.c include ./data/Makefile include ../../common.mk diff --git a/target/sim/sw/device/apps/snax/snax-data-reshuffler/data/Makefile b/target/sim/sw/device/apps/snax/snax-gemmx/data/Makefile similarity index 100% rename from target/sim/sw/device/apps/snax/snax-data-reshuffler/data/Makefile rename to target/sim/sw/device/apps/snax/snax-gemmx/data/Makefile diff --git a/target/sim/sw/device/apps/snax/snax-gemmx/data/datagen.py b/target/sim/sw/device/apps/snax/snax-gemmx/data/datagen.py new file mode 100755 index 000000000..44b6d997d --- /dev/null +++ b/target/sim/sw/device/apps/snax/snax-gemmx/data/datagen.py @@ -0,0 +1,903 @@ +#!/usr/bin/env python3 + +# Copyright 2024 KU Leuven. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Xiaoling Yi + +import numpy as np +import argparse +import pathlib +import hjson +import sys +import os +import subprocess + +# Add data utility path +sys.path.append(os.path.join(os.path.dirname(__file__), + "../../../../../../../../util/sim/")) +from data_utils import format_scalar_definition, format_vector_definition # noqa E402 + +# ----------------------- +# Add hypercorex utility paths +# ----------------------- +bender_command = subprocess.run(['bender', 'path', 'snitch_cluster'], + capture_output=True, text=True) +snax_utils_path = bender_command.stdout.strip() + +sys.path.append(snax_utils_path + "/util/sim/") + +# Add golden model path +from snax_utils import ( # noqa E402 + conv2d, + im2col, + block_gemm_golden_model, + data_reshuffler_golden_model, + postprocessing_simd_golden_model, + align_wide_addr, +) # noqa E402 + +np.random.seed(42) + + +# Add stdint.h header +def emit_header_file(**kwargs): + emit_str = "#include \n\n" + emit_str += emit_gemmx_data(**kwargs) + return emit_str + + +MIN = -128 +MAX = 127 + +bankWidth = 64 +input_data_width = 8 +output_data_width = 32 +quantized_output_data_width = 8 + + +def emit_conv_data(**kwargs): + Cin = kwargs["Cin"] + Cout = kwargs["Cout"] + if kwargs["ifC8HW8datalayout"] is True: + Nbatch, Cin8, H, W, _ = ( + kwargs["Nbatch"], + kwargs["Cin"] // 8, + kwargs["H"], + kwargs["W"], + 8, + ) + Cout8, Cin8, Kh, Kw, _, _ = ( + kwargs["Cout"] // 8, + kwargs["Cin"] // 8, + kwargs["Kh"], + kwargs["Kw"], + 8, + 8, + ) + + # test data generation + input_data = np.random.randint(-10, 10, size=(Nbatch, Cin8, H, W, 8)) + kernel = np.random.randint(-10, 10, size=(Cout8, Cin8, Kh, Kw, 8, 8)) + else: + # conv2d settings + Nbatch, H, W, Cin = (kwargs["Nbatch"], kwargs["H"], kwargs["W"], kwargs["Cin"]) + Cout, Kh, Kw, Cin = (kwargs["Cout"], kwargs["Kh"], kwargs["Kw"], kwargs["Cin"]) + + # test data generation + input_data = np.random.randint(MIN, MAX, size=(Nbatch, H, W, Cin)) + 
kernel = np.random.randint(MIN, MAX, size=(Cout, Kh, Kw, Cin)) + stride_h, stride_w = (kwargs["stride_h"], kwargs["stride_w"]) + pad_h, pad_w = (kwargs["pad_h"], kwargs["pad_w"]) + + # inferred config from the input data and kernel + padding = pad_h, pad_w + stride = stride_h, stride_w + + # Padding the input data + + if kwargs["ifC8HW8datalayout"] is True: + input_padding = np.pad( + input_data, + ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), + mode="constant", + ) + else: + input_padding = np.pad( + input_data, + ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), + mode="constant", + ) + + # Calculate the size of the output feature map + out_height = (H + 2 * pad_h - Kh) // stride_h + 1 + out_width = (W + 2 * pad_w - Kw) // stride_w + 1 + + assert out_width % 8 == 0, "out_width must be multiple of 8" + + M = out_height * out_width // 8 + K = Cin // 8 * Kh * Kw + N = Cout // 8 + + length_c = M * N * 8 * 8 + # bias = np.random.randint(MIN, MAX, length_c) + bias = np.random.randint(-(2**30), 2**30 - 1, length_c) + + data_str = [] + + # Generating conv2d settings + data_str += [ + format_scalar_definition("int", "Nbatch", Nbatch), + format_scalar_definition("int", "H", H), + format_scalar_definition("int", "W", W), + format_scalar_definition("int", "Cin", Cin), + format_scalar_definition("int", "Cout", Cout), + format_scalar_definition("int", "Kh", Kh), + format_scalar_definition("int", "Kw", Kw), + format_scalar_definition("int", "stride_h", stride_h), + format_scalar_definition("int", "stride_w", stride_w), + format_scalar_definition("int", "pad_h", pad_h), + format_scalar_definition("int", "pad_w", pad_w), + ] + + # Generating matrix size settings + data_str += [ + format_scalar_definition("int", "Batch", Nbatch), + format_scalar_definition("int", "M", M), + format_scalar_definition("int", "K", K), + format_scalar_definition("int", "N", N), + ] + + # Generating base pointer settings + delta_local_a = 0 + + delta_local_b = input_padding.size + assert 
input_padding.size == (Nbatch * Cin8 * (H + 2 * pad_h) * (W + 2 * pad_w) * 8) + + delta_local_b = align_wide_addr(delta_local_b, 64) + assert delta_local_b % 64 == 0 + + delta_local_c = delta_local_b + kernel.size + assert kernel.size == (Cout8 * Cin8 * Kh * Kw * 8 * 8) + delta_local_c = align_wide_addr(delta_local_c, 64) + assert delta_local_c % 64 == 0 + + delta_local_d8 = delta_local_c + length_c * 4 + delta_local_d8 = align_wide_addr(delta_local_d8, 64) + assert delta_local_d8 % 64 == 0 + + delta_local_d32 = delta_local_d8 + data_str += [ + format_scalar_definition("int32_t", "delta_local_a", delta_local_a), + format_scalar_definition("int32_t", "delta_local_b", delta_local_b), + format_scalar_definition("int32_t", "delta_local_d8", delta_local_d8), + format_scalar_definition("int32_t", "delta_local_c", delta_local_c), + format_scalar_definition("int32_t", "delta_local_d32", delta_local_d32), + ] + + # for streamer cfg + # streamer setting for data mover A + if kwargs["ifC8HW8datalayout"] is True: + # NC8HW8 + Aslstride0 = 1 + Aslstride1 = 8 * stride_w + + # K dim + Atlbound0 = Kw + Atlstride0 = 8 + + Atlbound1 = Kh + Atlstride1 = 8 * (W + 2 * pad_w) + + Atlbound2 = Cin8 + Atlstride2 = 8 * (W + 2 * pad_w) * (H + 2 * pad_h) + + # N dim + Atlbound3 = Cout // 8 + Atlstride3 = 0 + + # M dim + Atlbound4 = out_width // 8 + Atlstride4 = 8 * 8 * stride_w + + Atlbound5 = out_height + Atlstride5 = 8 * (W + 2 * pad_w) * stride_h + + # Batch dim + Atlbound6 = Nbatch + Atlstride6 = 8 * Cin8 * (H + 2 * pad_h) * (W + 2 * pad_w) + + else: + # NHWC + Aslstride0 = 1 + Aslstride1 = Cin * stride_w + + # K dim + Atlbound0 = Cin // 8 + Atlstride0 = 8 + + Atlbound1 = Kw + Atlstride1 = Cin + + Atlbound2 = Kh + Atlstride2 = Cin * (W + 2 * pad_w) + + # N dim + Atlbound3 = Cout // 8 + Atlstride3 = 0 + + # M dim + Atlbound4 = out_width // 8 + Atlstride4 = Cin * 8 + + Atlbound5 = out_height + Atlstride5 = Cin * (W + 2 * pad_w) * stride_h + + # Batch dim + Atlbound6 = Nbatch + Atlstride6 = 
Cin * (H + 2 * pad_h) * (W + 2 * pad_w) + + assert ( + Atlstride0 % 8 == 0 + and Atlstride1 % 8 == 0 + and Atlstride2 % 8 == 0 + and Atlstride3 % 8 == 0 + and Atlstride4 % 8 == 0 + and Atlstride5 % 8 == 0 + and Atlstride6 % 8 == 0 + ) + + assert ( + M * K * N + == Atlbound0 + * Atlbound1 + * Atlbound2 + * Atlbound3 + * Atlbound4 + * Atlbound5 + * Atlbound6 + ) + + data_str += [ + format_scalar_definition("int32_t", "Aslstride0", Aslstride0), + format_scalar_definition("int32_t", "Aslstride1", Aslstride1), + format_scalar_definition("int32_t", "Atlbound0", Atlbound0), + format_scalar_definition("int32_t", "Atlstride0", Atlstride0), + format_scalar_definition("int32_t", "Atlbound1", Atlbound1), + format_scalar_definition("int32_t", "Atlstride1", Atlstride1), + format_scalar_definition("int32_t", "Atlbound2", Atlbound2), + format_scalar_definition("int32_t", "Atlstride2", Atlstride2), + format_scalar_definition("int32_t", "Atlbound3", Atlbound3), + format_scalar_definition("int32_t", "Atlstride3", Atlstride3), + format_scalar_definition("int32_t", "Atlbound4", Atlbound4), + format_scalar_definition("int32_t", "Atlstride4", Atlstride4), + format_scalar_definition("int32_t", "Atlbound5", Atlbound5), + format_scalar_definition("int32_t", "Atlstride5", Atlstride5), + format_scalar_definition("int32_t", "Atlbound6", Atlbound6), + format_scalar_definition("int32_t", "Atlstride6", Atlstride6), + ] + + if kwargs["ifC8HW8datalayout"] is True: + # Cout8Cin8FyFx88 + # streamer setting for data mover B + Bslstride0 = 1 + Bslstride1 = 8 + + # K dim + Btlbound0 = Kw * Kh * Cin8 + Btlstride0 = 8 * 8 + + # N dim + Btlbound1 = Cout // 8 + Btlstride1 = 8 * 8 * Kw * Kh * Cin8 + + # M dim + Btlbound2 = out_width * out_height // 8 + Btlstride2 = 0 + + # Batch dim + Btlbound3 = Nbatch + Btlstride3 = 0 + else: + # NCoutFyFxCin + # streamer setting for data mover B + Bslstride0 = 1 + Bslstride1 = Cin * Kw * Kh + + # K dim + Btlbound0 = Cin * Kw * Kh // 8 + Btlstride0 = 8 + + # N dim + 
Btlbound1 = Cout // 8 + Btlstride1 = Cin * Kw * Kh * 8 + + # M dim + Btlbound2 = out_width * out_height // 8 + Btlstride2 = 0 + + # Batch dim + Btlbound3 = Nbatch + Btlstride3 = 0 + + assert ( + Btlstride0 % 64 == 0 + and Btlstride1 % 64 == 0 + and Btlstride2 % 64 == 0 + and Btlstride3 % 64 == 0 + ) + + assert K * N * M == Btlbound0 * Btlbound1 * Btlbound2 * Btlbound3, ( + "K * N * M", + K * N * M, + "Loopbounds multipliers ", + Btlbound0 * Btlbound1 * Btlbound2 * Btlbound3, + ) + + data_str += [ + format_scalar_definition("int32_t", "Bslstride0", Bslstride0), + format_scalar_definition("int32_t", "Bslstride1", Bslstride1), + format_scalar_definition("int32_t", "Btlbound0", Btlbound0), + format_scalar_definition("int32_t", "Btlstride0", Btlstride0), + format_scalar_definition("int32_t", "Btlbound1", Btlbound1), + format_scalar_definition("int32_t", "Btlstride1", Btlstride1), + format_scalar_definition("int32_t", "Btlbound2", Btlbound2), + format_scalar_definition("int32_t", "Btlstride2", Btlstride2), + format_scalar_definition("int32_t", "Btlbound3", Btlbound3), + format_scalar_definition("int32_t", "Btlstride3", Btlstride3), + ] + + # streamer setting for data mover C + # C is int32_t so the stride is 4 times of the int8_t + if kwargs["ifC8HW8datalayout"] is True: + # NHWC + Cslstride0 = 4 + Cslstride1 = 8 + + # N dim + Ctlbound0 = Cout // 8 + Ctlstride0 = out_height * out_width // 8 * 8 * 8 * 4 + + # M dim + # K is merged because of the block gemm output stationarity + Ctlbound1 = out_width // 8 + Ctlstride1 = 8 * 8 * 4 + + Ctlbound2 = out_height + Ctlstride2 = out_width // 8 * 8 * 8 * 4 + + # Batch dim + Ctlbound3 = Nbatch + Ctlstride3 = Cout * out_height * out_width * 4 + + else: + Cslstride0 = 4 + Cslstride1 = 8 + + # N dim + Ctlbound0 = Cout // 8 + Ctlstride0 = 8 * 4 + + # M dim + # K is merged because of the block gemm output stationarity + Ctlbound1 = out_width // 8 + Ctlstride1 = Cout * 8 * 4 + + Ctlbound2 = out_height + Ctlstride2 = Cout * W * 4 + + # 
Batch dim + Ctlbound3 = Nbatch + Ctlstride3 = Cout * H * W * 4 + + assert ( + Ctlstride0 % 64 == 0 + and Ctlstride1 % 64 == 0 + and Ctlstride2 % 64 == 0 + and Ctlstride3 % 64 == 0 + ) + assert M * N == Ctlbound0 * Ctlbound1 * Ctlbound2 * Ctlbound3 + + data_str += [ + format_scalar_definition("int32_t", "Cslstride0", Cslstride0), + format_scalar_definition("int32_t", "Cslstride1", Cslstride1), + format_scalar_definition("int32_t", "Ctlbound0", Ctlbound0), + format_scalar_definition("int32_t", "Ctlstride0", Ctlstride0), + format_scalar_definition("int32_t", "Ctlbound1", Ctlbound1), + format_scalar_definition("int32_t", "Ctlstride1", Ctlstride1), + format_scalar_definition("int32_t", "Ctlbound2", Ctlbound2), + format_scalar_definition("int32_t", "Ctlstride2", Ctlstride2), + format_scalar_definition("int32_t", "Ctlbound3", Ctlbound3), + format_scalar_definition("int32_t", "Ctlstride3", Ctlstride3), + ] + + if kwargs["ifC8HW8datalayout"] is True: + D32slstride0 = 1 * 4 + D32slstride1 = 8 + + # N dim + D32tlbound0 = Cout // 8 + D32tlstride0 = out_height * out_width // 8 * 8 * 8 * 4 + + # M dim + # K is merged because of the block gemm output stationarity + D32tlbound1 = out_width // 8 + D32tlstride1 = 8 * 8 * 4 + + D32tlbound2 = out_height + D32tlstride2 = out_width // 8 * 8 * 8 * 4 + + # Batch dim + D32tlbound3 = Nbatch + D32tlstride3 = Cout * out_height * out_width * 4 + else: + # D32 is int32_t so the stride is 4 times of the int8_t + D32out = Cout + D32slstride0 = 1 * 4 + D32slstride1 = 8 + + # N dim + D32tlbound0 = D32out // 8 + D32tlstride0 = 8 * 4 + + # M dim + # K is merged because of the block gemm output stationarity + D32tlbound1 = out_width // 8 + D32tlstride1 = D32out * 8 * 4 + + D32tlbound2 = out_height + D32tlstride2 = D32out * out_width * 4 + + # Batch dim + D32tlbound3 = Nbatch + D32tlstride3 = D32out * out_height * out_width * 4 + + assert ( + D32tlstride0 % 64 == 0 + and D32tlstride1 % 64 == 0 + and D32tlstride2 % 64 == 0 + and D32tlstride3 % 64 == 0 + 
) + + data_str += [ + format_scalar_definition("int32_t", "D32slstride0", D32slstride0), + format_scalar_definition("int32_t", "D32slstride1", D32slstride1), + format_scalar_definition("int32_t", "D32tlbound0", D32tlbound0), + format_scalar_definition("int32_t", "D32tlstride0", D32tlstride0), + format_scalar_definition("int32_t", "D32tlbound1", D32tlbound1), + format_scalar_definition("int32_t", "D32tlstride1", D32tlstride1), + format_scalar_definition("int32_t", "D32tlbound2", D32tlbound2), + format_scalar_definition("int32_t", "D32tlstride2", D32tlstride2), + format_scalar_definition("int32_t", "D32tlbound3", D32tlbound3), + format_scalar_definition("int32_t", "D32tlstride3", D32tlstride3), + ] + + # postprocessing D8 settings + if kwargs["ifC8HW8datalayout"] is True: + D8slstride0 = 1 + D8slstride1 = 8 + + # N dim + D8tlbound0 = Cout // 8 + D8tlstride0 = out_height * out_width // 8 * 8 * 8 + + # M dim + # K is merged because of the block gemm output stationarity + D8tlbound1 = out_width // 8 + D8tlstride1 = 8 * 8 + + D8tlbound2 = out_height + D8tlstride2 = out_width // 8 * 8 * 8 + + # Batch dim + D8tlbound3 = Nbatch + D8tlstride3 = Cout * out_height * out_width + else: + D8out = Cout + D8slstride0 = 1 + D8slstride1 = D8out + + # N dim + D8tlbound0 = D8out // 8 + D8tlstride0 = 8 + + # M dim + # K is merged because of the block gemm output stationarity + D8tlbound1 = out_width // 8 + D8tlstride1 = D8out * 8 + + D8tlbound2 = out_height + D8tlstride2 = D8out * out_width + + # Batch dim + D8tlbound3 = Nbatch + D8tlstride3 = D8out * out_height * out_width + + assert ( + D8tlstride0 % 64 == 0 + and D8tlstride1 % 64 == 0 + and D8tlstride2 % 64 == 0 + and D8tlstride3 % 64 == 0 + ) + data_str += [ + format_scalar_definition("int32_t", "D8slstride0", D8slstride0), + format_scalar_definition("int32_t", "D8slstride1", D8slstride1), + format_scalar_definition("int32_t", "D8tlbound0", D8tlbound0), + format_scalar_definition("int32_t", "D8tlstride0", D8tlstride0), + 
def emit_matmul_data(**kwargs):
    """Build the data.h entries and the golden result for a block matmul.

    Reads ``M``, ``K``, ``N``, ``meshRow``, ``tileSize``, ``meshCol``,
    ``transposed_A`` and ``transposed_B`` from ``kwargs``.  The bit widths
    (``bankWidth``, ``input_data_width``, ``output_data_width``,
    ``quantized_output_data_width``) and the ``MIN``/``MAX`` random bounds
    are module-level names.

    All strides and base-pointer offsets are computed with floor division
    so that the values written into the ``int32_t`` definitions are Python
    ints (``8``) instead of floats (``8.0``).  This assumes every
    ``width * elements`` product is a multiple of 8 bits, which a byte
    address requires anyway.

    Returns:
        tuple: ``(data_str, D32)`` where ``data_str`` is the list of
        formatted C definitions and ``D32`` is whatever
        ``block_gemm_golden_model`` returns for the generated operands.
    """
    M, K, N = kwargs["M"], kwargs["K"], kwargs["N"]
    meshRow = kwargs["meshRow"]
    tileSize = kwargs["tileSize"]
    meshCol = kwargs["meshCol"]

    data_str = []

    def emit(ctype, name, value):
        # Append one scalar C definition, preserving emission order.
        data_str.append(format_scalar_definition(ctype, name, value))

    # Sizes in bytes (bit widths divided by 8), kept integral on purpose.
    bank_bytes = bankWidth // 8
    a_tile_bytes = input_data_width * tileSize * meshRow // 8
    b_tile_bytes = input_data_width * tileSize * meshCol // 8
    c_tile_bytes = output_data_width * meshRow * meshCol // 8
    d8_tile_bytes = quantized_output_data_width * meshRow * meshCol // 8

    # matmul settings
    emit("int", "Batch", 1)
    emit("int", "M", M)
    emit("int", "K", K)
    emit("int", "N", N)

    # streamer setting for data mover A (stride 0 over N: A is reused)
    emit("int32_t", "Aslstride0", 1)
    emit("int32_t", "Aslstride1", bank_bytes)
    emit("int32_t", "Atlbound0", K)
    emit("int32_t", "Atlstride0", a_tile_bytes)
    emit("int32_t", "Atlbound1", N)
    emit("int32_t", "Atlstride1", 0)
    emit("int32_t", "Atlbound2", M)
    emit("int32_t", "Atlstride2", K * a_tile_bytes)
    # the remaining temporal loops of A are unused for plain matmul
    emit("int32_t", "Atlbound3", 1)
    emit("int32_t", "Atlstride3", 0)
    emit("int32_t", "Atlbound4", 1)
    emit("int32_t", "Atlstride4", 0)
    emit("int32_t", "Atlbound5", 1)
    emit("int32_t", "Atlstride5", 0)

    # streamer setting for data mover B (stride 0 over M: B is reused)
    emit("int32_t", "Bslstride0", 1)
    emit("int32_t", "Bslstride1", bank_bytes)
    emit("int32_t", "Btlbound0", K)
    emit("int32_t", "Btlstride0", b_tile_bytes)
    emit("int32_t", "Btlbound1", N)
    emit("int32_t", "Btlstride1", K * b_tile_bytes)
    emit("int32_t", "Btlbound2", M)
    emit("int32_t", "Btlstride2", 0)

    # C and D32 use the same int32 layout, so they share one recipe
    for prefix in ("C", "D32"):
        emit("int32_t", prefix + "slstride0", 4)
        emit("int32_t", prefix + "slstride1", bank_bytes)
        emit("int32_t", prefix + "tlbound0", N)
        emit("int32_t", prefix + "tlstride0", c_tile_bytes)
        emit("int32_t", prefix + "tlbound1", M)
        emit("int32_t", prefix + "tlstride1", N * c_tile_bytes)
        emit("int32_t", prefix + "tlbound2", 1)
        emit("int32_t", prefix + "tlstride2", 0)

    # quantized (D8) output
    emit("int32_t", "D8slstride0", 1)
    emit("int32_t", "D8slstride1", bank_bytes)
    emit("int32_t", "D8tlbound0", N)
    emit("int32_t", "D8tlstride0", d8_tile_bytes)
    emit("int32_t", "D8tlbound1", M)
    emit("int32_t", "D8tlstride1", N * d8_tile_bytes)
    emit("int32_t", "D8tlbound2", 1)
    emit("int32_t", "D8tlstride2", 0)

    # Base-pointer offsets: A, B, C are laid out back to back; D32 and D8
    # share the same offset right after C.
    delta_local_a = 0
    delta_local_b = K * M * a_tile_bytes
    delta_local_c = delta_local_b + K * N * b_tile_bytes
    delta_local_d32 = delta_local_c + M * N * c_tile_bytes
    delta_local_d8 = delta_local_d32
    emit("int32_t", "delta_local_a", delta_local_a)
    emit("int32_t", "delta_local_b", delta_local_b)
    emit("int32_t", "delta_local_c", delta_local_c)
    emit("int32_t", "delta_local_d32", delta_local_d32)
    emit("int32_t", "delta_local_d8", delta_local_d8)

    # Random subtraction values (randint's upper bound MAX is exclusive)
    subtraction_a = np.random.randint(MIN, MAX)
    subtraction_b = np.random.randint(MIN, MAX)

    # Writing the subtraction value to data.h
    emit("int8_t", "subtraction_a", subtraction_a)
    emit("int8_t", "subtraction_b", subtraction_b)

    # Operands are written to data.h exactly as generated ...
    A = np.random.randint(MIN, MAX, size=(M, K, meshRow, tileSize)).reshape(-1)
    data_str.append(format_vector_definition("int8_t", "A", A))
    B = np.random.randint(MIN, MAX, size=(K, N, tileSize, meshCol)).reshape(-1)
    data_str.append(format_vector_definition("int8_t", "B", B))
    C = np.random.randint(MIN, MAX, size=(M, N, meshRow, meshCol)).reshape(-1)
    data_str.append(format_vector_definition("int32_t", "C", C))

    # ... while the golden model receives the per-tile transposed copies
    # whenever the corresponding flag is set.
    if kwargs["transposed_A"] == 1:
        A = A.reshape(M, K, meshRow, tileSize)
        A = A.transpose(0, 1, 3, 2).reshape(-1)
    if kwargs["transposed_B"] == 1:
        B = B.reshape(K, N, tileSize, meshCol)
        B = B.transpose(0, 1, 3, 2).reshape(-1)

    emit("int32_t", "transposed_A", kwargs["transposed_A"])
    emit("int32_t", "transposed_B", kwargs["transposed_B"])

    D32 = block_gemm_golden_model(
        M,
        K,
        N,
        meshRow,
        tileSize,
        meshCol,
        A,
        B,
        subtraction_a,
        subtraction_b,
        C,
    )

    return data_str, D32
def emit_gemmx_data(**kwargs):
    """Assemble the full data.h body for one GEMMX test case.

    Dispatches on ``kwargs["ifTestMatmul"]``: ``1`` generates matmul data
    (and adds ``#define TEST_MATMUL``), anything else generates conv2d
    data.  The int32 golden result ``D32`` is emitted, followed by random
    post-processing constants and the ``D8`` golden result obtained from
    ``postprocessing_simd_golden_model``.

    Returns:
        str: all definitions joined by blank lines.
    """
    ifTestMatmul = kwargs["ifTestMatmul"]

    if ifTestMatmul == 1:
        data_str, D32 = emit_matmul_data(**kwargs)
        data_str += ["#define TEST_MATMUL"]
    else:
        data_str, D32 = emit_conv_data(**kwargs)

    data_str += [format_vector_definition("int32_t", "D32", D32)]

    # -----------------------------------------------------------
    # Postprocessing
    # -----------------------------------------------------------

    bypassSIMD = kwargs["bypassSIMD"]
    data_str += [format_scalar_definition("int32_t", "bypassSIMD", bypassSIMD)]

    # Generating random constant values.
    # NOTE: np.random.randint excludes its upper bound.
    input_zp_i = np.random.randint(MIN, MAX)
    output_zp_i = np.random.randint(MIN, MAX)
    shift_i = np.random.randint(0, 63)  # draws 0..62 (63 is excluded)
    max_int_i = MAX
    min_int_i = MIN
    # randint(0, 1) could only ever return 0, so double rounding was never
    # exercised; (0, 2) draws the flag from {0, 1}.
    double_round_i = np.random.randint(0, 2)
    multiplier_i = np.random.randint(-(2**31), 2**31 - 1)

    # Writing the constant values to data.h
    data_str += [
        format_scalar_definition("int8_t", "input_zp_i", input_zp_i),
        format_scalar_definition("int8_t", "output_zp_i", output_zp_i),
        format_scalar_definition("int8_t", "shift_i", shift_i),
        format_scalar_definition("int8_t", "max_int_i", max_int_i),
        format_scalar_definition("int8_t", "min_int_i", min_int_i),
        format_scalar_definition("int8_t", "double_round_i", double_round_i),
        format_scalar_definition("int32_t", "multiplier_i", multiplier_i),
    ]

    # Golden quantized output for the same constants
    D8 = postprocessing_simd_golden_model(
        D32,
        input_zp_i,
        output_zp_i,
        shift_i,
        max_int_i,
        min_int_i,
        double_round_i,
        multiplier_i,
    )
    data_str += [format_vector_definition("int8_t", "D8", D8)]

    data_str = "\n\n".join(data_str)

    return data_str
def test():
    """Self-check: conv2d via im2col + block GEMM must equal direct conv2d.

    Uses a fixed 1x24x24x40 input and an 8x3x3x40 kernel with stride 1 and
    padding 1.  Raises ``AssertionError`` when the two results differ.
    """
    np.set_printoptions(threshold=np.inf)

    # conv2d settings
    Nbatch, H, W, Cin = (1, 24, 24, 40)
    Cout, Kh, Kw = (8, 3, 3)

    stride = (1, 1)
    padding = (1, 1)

    # test data generation
    input_data = np.random.randint(-10, 10, size=(Nbatch, H, W, Cin))
    kernel = np.random.randint(-10, 10, size=(Cout, Kh, Kw, Cin))

    # im2col is evaluated once; the result provides both the block counts
    # and the operands (the original recomputed it with identical inputs).
    im2col_matrix, im2col_kernel = im2col(
        input_data, kernel, stride=stride, padding=padding
    )

    # number of 8x8 blocks along each GEMM dimension
    M = im2col_matrix.shape[0] // 8
    K = im2col_matrix.shape[1] // 8
    N = im2col_kernel.shape[1] // 8

    # conv2d using im2col
    im2col_matrix = data_reshuffler_golden_model(
        K, M, 8, 8, 8, 8 * 8 * K, 1, 8 * K, im2col_matrix.reshape(-1)
    )
    im2col_kernel = data_reshuffler_golden_model(
        K, N, 8, 8, 8, 8 * 8 * K, 1, 8 * K, im2col_kernel.T.reshape(-1)
    )
    im2col_conv2d_res = block_gemm_golden_model(
        M, K, N, 8, 8, 8, im2col_matrix, im2col_kernel, 0, 0
    )

    # direct conv2d, reshuffled the same way as the GEMM output
    direct_conv2d_res = conv2d(input_data, kernel, stride=stride, padding=padding)
    direct_conv2d_res = direct_conv2d_res.reshape(-1)
    direct_conv2d_res = data_reshuffler_golden_model(
        N, M, 8, 8, 8, 8 * 8 * N, 1, 8 * N, direct_conv2d_res, 1
    )

    # result comparison
    assert (im2col_conv2d_res == direct_conv2d_res).all()


def main():
    """Parse ``-c/--cfg``, load the hjson parameters and print data.h."""
    # Parsing cmd args
    parser = argparse.ArgumentParser(description="Generate data for kernels")
    parser.add_argument(
        "-c",
        "--cfg",
        type=pathlib.Path,
        required=True,
        help="Select param config file kernel",
    )
    args = parser.parse_args()

    # Load param config file
    with args.cfg.open() as f:
        param = hjson.loads(f.read())

    # Emit header file on stdout
    print(emit_header_file(**param))
// Copyright 2024 KU Leuven.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Xiaoling Yi

#include "data.h"

#include "snax-gemmx-params.h"

#include "snax-gemmx-lib.h"

// This is the main function for the SNAX GEMMX test (matmul when
// TEST_MATMUL is defined by data.h, conv2d otherwise).
// Flow: DMA the operands A, B, C into local memory, program the streamer and
// GEMMX CSRs, start both, wait for completion, then compare the output with
// the golden data (D8 or D32) from data.h.
// Returns the accumulated error count (0 on success).
int main() {
    // Set err value for checking
    int err = 0;

    // Only cluster 1 runs the test; every other cluster falls through and
    // returns 0.
    if (snrt_cluster_idx() == 1){
        printf("SNAX GEMM Conv2d: Start\n");
        // Prepare addresses in TCDM
        int8_t *local_a, *local_b;
        int32_t *local_c, *local_d32;
        int8_t *local_d8;

        // Place each buffer at its generated offset from snrt_l1_next()
        // (presumably the first free TCDM address -- TODO confirm)
        local_a = (int8_t *)(snrt_l1_next() + delta_local_a);
        local_b = (int8_t *)(snrt_l1_next() + delta_local_b);
        local_c = (int32_t *)(snrt_l1_next() + delta_local_c);
        local_d32 = (int32_t *)(snrt_l1_next() + delta_local_d32);
        local_d8 = (int8_t *)(snrt_l1_next() + delta_local_d8);

        // Transfer data from L3 to L1
        // Using DMA only
        // cycle_start is reused later for the compute phase; program_start
        // keeps the timestamp of the very beginning.
        int32_t cycle_start = snrt_mcycle();
        int32_t program_start = snrt_mcycle();
        if (snrt_is_dm_core()) {
            #ifdef TEST_MATMUL
            snrt_dma_start_1d(local_a, A,
                              M * K * meshRow * tileSize * sizeof(int8_t));
            snrt_dma_start_1d(local_b, B,
                              N * K * tileSize * meshCol * sizeof(int8_t));
            #else
            // conv2d: A is the zero-padded input, B the raw kernel
            snrt_dma_start_1d(
                local_a, A,
                Nbatch * (H + 2 * pad_h) * (W + 2 * pad_w) * Cin * sizeof(int8_t));
            snrt_dma_start_1d(local_b, B, Cout * Kh * Kw * Cin * sizeof(int8_t));
            #endif
            snrt_dma_start_1d(local_c, C,
                              M * N * meshRow * meshCol * sizeof(int32_t));
            snrt_dma_wait_all();
        }

        // All cores wait here until the DMA core has finished the copies
        snrt_cluster_hw_barrier();
        int32_t cycle_end = snrt_mcycle();
        printf("DMA cycles %d \n", cycle_end - cycle_start);

        // NOTE(review): this tests the *global* core index while nested
        // inside `snrt_cluster_idx() == 1`. If global indices are numbered
        // consecutively across clusters, no core of cluster 1 has index 0
        // and this branch never runs -- confirm against the runtime, a
        // cluster-local core index may be what is intended.
        if (snrt_global_core_idx() == 0) {
            cycle_start = snrt_mcycle();
            // Set Streamer configuration CSR for conv2d
            set_gemmx_streamer_csr(
                Aslstride0, Aslstride1, Atlbound0, Atlstride0, Atlbound1,
                Atlstride1, Atlbound2, Atlstride2, Atlbound3, Atlstride3, Atlbound4,
                Atlstride4, Atlbound5, Atlstride5,

                Bslstride0, Bslstride1, Btlbound0, Btlstride0, Btlbound1,
                Btlstride1, Btlbound2, Btlstride2,

                D8slstride0, D8slstride1, D8tlbound0, D8tlstride0, D8tlbound1,
                D8tlstride1, D8tlbound2, D8tlstride2,

                Cslstride0, Cslstride1, Ctlbound0, Ctlstride0, Ctlbound1,
                Ctlstride1, Ctlbound2, Ctlstride2,

                D32slstride0, D32slstride1, D32tlbound0, D32tlstride0, D32tlbound1,
                D32tlstride1, D32tlbound2, D32tlstride2,

                delta_local_a, delta_local_b, delta_local_d8, delta_local_c,
                delta_local_d32, bypassSIMD, transposed_A, transposed_B);

            // Set CSR to start Streamer for conv2d
            set_gemmx_streamer_start();

            // Set GEMMX configuration CSR
            uint32_t subtraction_setting =
                gen_subtraction_config(subtraction_a, subtraction_b);

            // Post-processing constants packed into three CSR words
            uint32_t csr0 =
                gen_csr0_config(input_zp_i, output_zp_i, shift_i, max_int_i);
            uint32_t csr1 = gen_csr1_config(min_int_i, double_round_i);
            uint32_t csr2 = gen_csr2_config(multiplier_i);

            set_gemmx_csr(K, N, M, subtraction_setting, csr0, csr1, csr2, M * N,
                          bypassSIMD);

            // Set CSR to start GEMM
            set_gemmx_start();

            // Poll until Streamer and GEMM accelerator finish
            wait_gemmx_and_streamer();

            cycle_end = snrt_mcycle();

            // Check the result: the quantized D8 output when the SIMD stage
            // is active, the raw int32 D32 output when it is bypassed
            if (!bypassSIMD) {
                err += check_gemmx_result_D8(local_d8, D8, Batch, M, N);
            } else {
                err += check_gemmx_result_D32(local_d32, D32, Batch, M, N);
            }
            #ifdef TEST_MATMUL
            printf("SNAX GEMM Matmul: %s, Error: %d . bypassSIMD = %d .\n",
                   err ? "FAIL" : "PASS", err, bypassSIMD);
            #else
            printf("SNAX GEMM Conv2d: %s, Error: %d . bypassSIMD = %d .\n",
                   err ? "FAIL" : "PASS", err, bypassSIMD);
            #endif
            // Report the performance counters and the software-measured cycles
            int32_t gemmx_cycles = read_gemmx_perf_counter();
            int32_t gemmx_streamer_cycles = read_gemmx_streamer_perf_counter();
            printf("Workload size: M = %d x N = %d x K = %d\n", M, N, K);
            printf("SNAX GEMM cycles: %d\n", gemmx_cycles);
            printf("SNAX GEMM Streamer cycles: %d\n", gemmx_streamer_cycles);
            printf("SNAX GEMM cycles from cfg to compute finish: %d\n", cycle_end - cycle_start);
            printf("SNAX GEMM cycles from DMA to compute finish: %d\n", cycle_end - program_start);
        };
    };
    return err;
}
-# SPDX-License-Identifier: Apache-2.0 -# -# Xiaoling Yi - -APP = snax-streamer-gemm-conv-simd - -INCDIRS = data - -INCDIRS += ../../../snax/streamer-gemm/include -INCDIRS += ../../../snax/streamer-gemm-conv-simd/include -INCDIRS += ../../../snax/gemm/include -INCDIRS += ../../../snax/streamer-simd/include - -# Include this binary in the final build -RISCV_LDFLAGS += ../../../snax/streamer-gemm/build/snax-streamer-gemm-lib.o -RISCV_LDFLAGS += ../../../snax/gemm/build/snax-gemm-lib.o -RISCV_LDFLAGS += ../../../snax/streamer-simd/build/snax-streamer-simd-lib.o -RISCV_LDFLAGS += ../../../snax/streamer-gemm-conv-simd/build/snax-streamer-gemm-conv-simd-lib.o - -SRCS = src/snax-streamer-gemm-conv-simd.c - -include ./data/Makefile -include ../../common.mk - -$(DEP): $(DATA_H) diff --git a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/Makefile b/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/Makefile deleted file mode 100644 index 18006cbf7..000000000 --- a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2023 KU Leuven. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Xiaoling Yi - -# Usage of absolute paths is required to externally include this Makefile -MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -DATA_DIR := $(realpath $(MK_DIR)) - -DATA_CFG ?= $(DATA_DIR)/params.hjson - -DATA_H = $(DATA_DIR)/data.h - -$(DATA_H): $(DATA_DIR)/datagen.py $(DATA_CFG) - $< -c $(DATA_CFG) > $@ - -.PHONY: clean-data clean - -clean-data: - rm -f $(DATA_H) - -clean: clean-data diff --git a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/datagen.py b/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/datagen.py deleted file mode 100755 index 9634e05ce..000000000 --- a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/datagen.py +++ /dev/null @@ -1,464 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2024 KU Leuven. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Xiaoling Yi - -import numpy as np -import argparse -import pathlib -import hjson -import sys -import os - -# Add data utility path -sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../../../../util/sim/")) -from data_utils import format_scalar_definition, format_vector_definition # noqa E402 - -# Add golden model path -from snax_utils import ( # noqa E402 - conv2d, - im2col, - block_gemm_golden_model, - data_reshuffler_golden_model, - postprocessing_simd_golden_model, -) # noqa E402 - -np.random.seed(42) - - -# Add stdint.h header -def emit_header_file(**kwargs): - emit_str = "#include \n\n" - emit_str += emit_gemm_data(**kwargs) - return emit_str - - -MIN = -128 -MAX = 127 - - -def emit_gemm_data(**kwargs): - - # conv2d settings - Nbatch, H, W, Cin = (kwargs["Nbatch"], kwargs["H"], kwargs["W"], kwargs["Cin"]) - Cout, Kh, Kw, Cin = (kwargs["Cout"], kwargs["Kh"], kwargs["Kw"], kwargs["Cin"]) - - pad_h, pad_w = (kwargs["stride_h"], kwargs["stride_w"]) - stride_h, stride_w = (kwargs["pad_h"], 
kwargs["pad_w"]) - - # test data generation - input_data = np.random.randint(MIN, MAX, size=(Nbatch, H, W, Cin)) - kernel = np.random.randint(MIN, MAX, size=(Cout, Kh, Kw, Cin)) - - # inferred config from the input data and kernel - padding = pad_h, pad_w - stride = stride_h, stride_w - - input_padding = np.pad( - input_data, ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), mode="constant" - ) - - im2col_matrix, im2col_kernel = im2col( - input_data, kernel, stride=stride, padding=padding - ) - - M = im2col_matrix.shape[0] // 8 - K = im2col_matrix.shape[1] // 8 - N = im2col_kernel.shape[1] // 8 - - length_c = M * N * 8 * 8 - bias = np.random.randint(-(2**30), 2**30 - 1, length_c) - - data_str = [] - - # Generating conv2d settings - data_str += [ - format_scalar_definition("int", "Nbatch", Nbatch), - format_scalar_definition("int", "H", H), - format_scalar_definition("int", "W", W), - format_scalar_definition("int", "Cin", Cin), - format_scalar_definition("int", "Cout", Cout), - format_scalar_definition("int", "Kh", Kh), - format_scalar_definition("int", "Kw", Kw), - format_scalar_definition("int", "stride_h", stride_h), - format_scalar_definition("int", "stride_w", stride_w), - format_scalar_definition("int", "pad_h", pad_h), - format_scalar_definition("int", "pad_w", pad_w), - ] - - # Generating matrix size settings - data_str += [ - format_scalar_definition("int", "Batch", Nbatch), - format_scalar_definition("int", "M", M), - format_scalar_definition("int", "K", K), - format_scalar_definition("int", "N", N), - ] - - # Generating base pointer settings - delta_local_a = 0 - delta_local_b = input_padding.size - delta_local_c = input_padding.size + kernel.size - delta_local_d8 = input_padding.size + kernel.size + length_c * 4 - delta_local_d32 = delta_local_d8 - data_str += [ - format_scalar_definition("int32_t", "delta_local_a", delta_local_a), - format_scalar_definition("int32_t", "delta_local_b", delta_local_b), - format_scalar_definition("int32_t", 
"delta_local_d8", delta_local_d8), - format_scalar_definition("int32_t", "delta_local_c", delta_local_c), - format_scalar_definition("int32_t", "delta_local_d32", delta_local_d32), - ] - - # for streamer cfg - # streamer setting for data mover A - Aslstride0 = 1 - Aslstride1 = Cin - - # K dim - Atlbound0 = Cin // 8 - Atlstride0 = 8 - - Atlbound1 = Kw - Atlstride1 = Cin * stride_w - - Atlbound2 = Kh - Atlstride2 = Cin * (W + 2 * pad_w) - - # N dim - Atlbound3 = Cout // 8 - Atlstride3 = 0 - - # M dim - Atlbound4 = W // 8 - Atlstride4 = Cin * 8 - - Atlbound5 = H - Atlstride5 = Cin * (W + 2 * pad_w) * stride_h - - # Batch dim - Atlbound6 = Nbatch - Atlstride6 = Cin * (H + 2 * pad_h) * (W + 2 * pad_w) - - data_str += [ - format_scalar_definition("int32_t", "Aslstride0", Aslstride0), - format_scalar_definition("int32_t", "Aslstride1", Aslstride1), - format_scalar_definition("int32_t", "Atlbound0", Atlbound0), - format_scalar_definition("int32_t", "Atlstride0", Atlstride0), - format_scalar_definition("int32_t", "Atlbound1", Atlbound1), - format_scalar_definition("int32_t", "Atlstride1", Atlstride1), - format_scalar_definition("int32_t", "Atlbound2", Atlbound2), - format_scalar_definition("int32_t", "Atlstride2", Atlstride2), - format_scalar_definition("int32_t", "Atlbound3", Atlbound3), - format_scalar_definition("int32_t", "Atlstride3", Atlstride3), - format_scalar_definition("int32_t", "Atlbound4", Atlbound4), - format_scalar_definition("int32_t", "Atlstride4", Atlstride4), - format_scalar_definition("int32_t", "Atlbound5", Atlbound5), - format_scalar_definition("int32_t", "Atlstride5", Atlstride5), - format_scalar_definition("int32_t", "Atlbound6", Atlbound6), - format_scalar_definition("int32_t", "Atlstride6", Atlstride6), - ] - - # streamer setting for data mover B - Bslstride0 = 1 - Bslstride1 = Cin * Kw * Kh - - # K dim - Btlbound0 = Cin * Kw * Kh // 8 - Btlstride0 = 8 - - # N dim - Btlbound1 = Cout // 8 - Btlstride1 = Cin * Kw * Kh * 8 - - # M dim - Btlbound2 = H 
* W // 8 - Btlstride2 = 0 - - # Batch dim - Btlbound3 = Nbatch - Btlstride3 = 0 - - data_str += [ - format_scalar_definition("int32_t", "Bslstride0", Bslstride0), - format_scalar_definition("int32_t", "Bslstride1", Bslstride1), - format_scalar_definition("int32_t", "Btlbound0", Btlbound0), - format_scalar_definition("int32_t", "Btlstride0", Btlstride0), - format_scalar_definition("int32_t", "Btlbound1", Btlbound1), - format_scalar_definition("int32_t", "Btlstride1", Btlstride1), - format_scalar_definition("int32_t", "Btlbound2", Btlbound2), - format_scalar_definition("int32_t", "Btlstride2", Btlstride2), - format_scalar_definition("int32_t", "Btlbound3", Btlbound3), - format_scalar_definition("int32_t", "Btlstride3", Btlstride3), - ] - - # streamer setting for data mover C - # C is int32_t so the stride is 4 times of the int8_t - Cslstride0 = 1 * 4 - Cslstride1 = Cout * 4 - - # N dim - Ctlbound0 = Cout // 8 - Ctlstride0 = 8 * 4 - - # M dim - # K is merged because of the block gemm output stationarity - Ctlbound1 = W // 8 - Ctlstride1 = Cout * 8 * 4 - - Ctlbound2 = H - Ctlstride2 = Cout * W * 4 - - # Batch dim - Ctlbound3 = Nbatch - Ctlstride3 = Cout * H * W * 4 - - data_str += [ - format_scalar_definition("int32_t", "Cslstride0", Cslstride0), - format_scalar_definition("int32_t", "Cslstride1", Cslstride1), - format_scalar_definition("int32_t", "Ctlbound0", Ctlbound0), - format_scalar_definition("int32_t", "Ctlstride0", Ctlstride0), - format_scalar_definition("int32_t", "Ctlbound1", Ctlbound1), - format_scalar_definition("int32_t", "Ctlstride1", Ctlstride1), - format_scalar_definition("int32_t", "Ctlbound2", Ctlbound2), - format_scalar_definition("int32_t", "Ctlstride2", Ctlstride2), - format_scalar_definition("int32_t", "Ctlbound3", Ctlbound3), - format_scalar_definition("int32_t", "Ctlstride3", Ctlstride3), - ] - - # D32 is int32_t so the stride is 4 times of the int8_t - D32out = Cout - D32slstride0 = 1 * 4 - D32slstride1 = D32out * 4 - - # N dim - D32tlbound0 = 
D32out // 8 - D32tlstride0 = 8 * 4 - - # M dim - # K is merged because of the block gemm output stationarity - D32tlbound1 = W // 8 - D32tlstride1 = D32out * 8 * 4 - - D32tlbound2 = H - D32tlstride2 = D32out * W * 4 - - # Batch dim - D32tlbound3 = Nbatch - D32tlstride3 = D32out * H * W * 4 - - data_str += [ - format_scalar_definition("int32_t", "D32slstride0", D32slstride0), - format_scalar_definition("int32_t", "D32slstride1", D32slstride1), - format_scalar_definition("int32_t", "D32tlbound0", D32tlbound0), - format_scalar_definition("int32_t", "D32tlstride0", D32tlstride0), - format_scalar_definition("int32_t", "D32tlbound1", D32tlbound1), - format_scalar_definition("int32_t", "D32tlstride1", D32tlstride1), - format_scalar_definition("int32_t", "D32tlbound2", D32tlbound2), - format_scalar_definition("int32_t", "D32tlstride2", D32tlstride2), - format_scalar_definition("int32_t", "D32tlbound3", D32tlbound3), - format_scalar_definition("int32_t", "D32tlstride3", D32tlstride3), - ] - - # postprocessing D8 settings - D8out = Cout - D8slstride0 = 1 - D8slstride1 = D8out - - # N dim - D8tlbound0 = D8out // 8 - D8tlstride0 = 8 - - # M dim - # K is merged because of the block gemm output stationarity - D8tlbound1 = W // 8 - D8tlstride1 = D8out * 8 - - D8tlbound2 = H - D8tlstride2 = D8out * W - - # Batch dim - D8tlbound3 = Nbatch - D8tlstride3 = D8out * H * W - - data_str += [ - format_scalar_definition("int32_t", "D8slstride0", D8slstride0), - format_scalar_definition("int32_t", "D8slstride1", D8slstride1), - format_scalar_definition("int32_t", "D8tlbound0", D8tlbound0), - format_scalar_definition("int32_t", "D8tlstride0", D8tlstride0), - format_scalar_definition("int32_t", "D8tlbound1", D8tlbound1), - format_scalar_definition("int32_t", "D8tlstride1", D8tlstride1), - format_scalar_definition("int32_t", "D8tlbound2", D8tlbound2), - format_scalar_definition("int32_t", "D8tlstride2", D8tlstride2), - format_scalar_definition("int32_t", "D8tlbound3", D8tlbound3), - 
format_scalar_definition("int32_t", "D8tlstride3", D8tlstride3), - ] - - # Generating random 8 integer a and b for subtraction - subtraction_a = 0 - subtraction_b = 0 - - # Writing the subtraction value to data.h - data_str += [ - format_scalar_definition("int8_t", "subtraction_a", subtraction_a), - format_scalar_definition("int8_t", "subtraction_b", subtraction_b), - ] - - # conv2d using im2col - im2col_matrix = data_reshuffler_golden_model( - K, M, 8, 8, 8, 8 * 8 * K, 1, 8 * K, im2col_matrix.reshape(-1) - ) - # row major to column major for B - im2col_kernel = im2col_kernel.T - im2col_kernel = data_reshuffler_golden_model( - K, N, 8, 8, 8, 8 * 8 * K, 1, 8 * K, im2col_kernel.reshape(-1) - ) - # im2col_conv2d_res = block_gemm_golden_model( - # M, K, N, 8, 8, 8, im2col_matrix, im2col_kernel, 0, 0, bias - # ) - # im2col_conv2d_res = data_reshuffler_golden_model(N, M, 8, 8, 8, 8 * 8 * N, - # 1, 8 * N, im2col_conv2d_res, 1) - - # direct conv2d - direct_conv2d_res = conv2d(input_data, kernel, stride=stride, padding=padding) - # output in NHWC format - direct_conv2d_res = direct_conv2d_res.reshape(-1) - - # Writing testing data and golden data into data.h - # implicit im2col matrix and kernel, store original input data and kernel - data_str += [format_vector_definition("int8_t", "A", input_padding.reshape(-1))] - data_str += [format_vector_definition("int8_t", "B", kernel.reshape(-1))] - data_str += [format_vector_definition("int32_t", "C", bias.reshape(-1))] - - # explicit im2col matrix and kernel, store the columned input data - # for comparing with the implicit im2col method - # data_str += [format_vector_definition("int8_t", "A", im2col_matrix)] - # data_str += [format_vector_definition("int8_t", "B", im2col_kernel)] - - # ----------------------------------------------------------- - # Postprocessing - # ----------------------------------------------------------- - - # Generating random constant values - input_zp_i = np.random.randint(MIN, MAX) - output_zp_i = 
np.random.randint(MIN, MAX) - shift_i = np.random.randint(0, 63) # values between 0-63 - max_int_i = MAX - min_int_i = MIN - double_round_i = np.random.randint(0, 1) - multiplier_i = np.random.randint(-(2**31), 2**31 - 1) - - # Writing the constant values to data.h - data_str += [ - format_scalar_definition("int8_t", "input_zp_i", input_zp_i), - format_scalar_definition("int8_t", "output_zp_i", output_zp_i), - format_scalar_definition("int8_t", "shift_i", shift_i), - format_scalar_definition("int8_t", "max_int_i", max_int_i), - format_scalar_definition("int8_t", "min_int_i", min_int_i), - format_scalar_definition("int8_t", "double_round_i", double_round_i), - format_scalar_definition("int32_t", "multiplier_i", multiplier_i), - ] - - bypassSIMD = kwargs["bypassSIMD"] - data_str += [format_scalar_definition("int32_t", "bypassSIMD", bypassSIMD)] - - data_str += [ - format_vector_definition( - "int32_t", "D32_direct_conv2d", np.add(direct_conv2d_res, bias) - ) - ] - - if bypassSIMD == 0: - direct_conv2d_res = postprocessing_simd_golden_model( - np.add(direct_conv2d_res, bias), - input_zp_i, - output_zp_i, - shift_i, - max_int_i, - min_int_i, - double_round_i, - multiplier_i, - ) - data_str += [ - format_vector_definition("int8_t", "D8_direct_conv2d", direct_conv2d_res) - ] - - data_str = "\n\n".join(data_str) - - return data_str - - -def test(): - np.set_printoptions(threshold=np.inf) - - # conv2d settings - Nbatch, H, W, Cin = (1, 24, 24, 40) - Cout, Kh, Kw, Cin = (8, 3, 3, 40) - - stride = (1, 1) - padding = (1, 1) - - # test data generation - input_data = np.random.randint(-10, 10, size=(Nbatch, H, W, Cin)) - kernel = np.random.randint(-10, 10, size=(Cout, Kh, Kw, Cin)) - - im2col_matrix, im2col_kernel = im2col( - input_data, kernel, stride=stride, padding=padding - ) - - M = im2col_matrix.shape[0] // 8 - K = im2col_matrix.shape[1] // 8 - N = im2col_kernel.shape[1] // 8 - - # conv2d using im2col - im2col_matrix, im2col_kernel = im2col( - input_data, kernel, 
stride=stride, padding=padding - ) - im2col_matrix = data_reshuffler_golden_model( - K, M, 8, 8, 8, 8 * 8 * K, 1, 8 * K, im2col_matrix.reshape(-1) - ) - im2col_kernel = data_reshuffler_golden_model( - K, N, 8, 8, 8, 8 * 8 * K, 1, 8 * K, im2col_kernel.T.reshape(-1) - ) - im2col_conv2d_res = block_gemm_golden_model( - M, K, N, 8, 8, 8, im2col_matrix, im2col_kernel, 0, 0 - ) - - # direct conv2d - direct_conv2d_res = conv2d(input_data, kernel, stride=stride, padding=padding) - direct_conv2d_res = direct_conv2d_res.reshape(-1) - direct_conv2d_res = data_reshuffler_golden_model( - N, M, 8, 8, 8, 8 * 8 * N, 1, 8 * N, direct_conv2d_res, 1 - ) - - # result comparison - assert (im2col_conv2d_res == direct_conv2d_res).all() - - -def main(): - # Parsing cmd args - parser = argparse.ArgumentParser(description="Generate data for kernels") - parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", - ) - args = parser.parse_args() - - # Load param config file - with args.cfg.open() as f: - param = hjson.loads(f.read()) - - # Emit header file - print(emit_header_file(**param)) - - -if __name__ == "__main__": - - # for testing - # test() - - main() diff --git a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/params.hjson b/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/params.hjson deleted file mode 100644 index d90d9f494..000000000 --- a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/data/params.hjson +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2024 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// Xiaoling Yi - -{ - Nbatch: 1, - H: 16, - W: 16, - Cin: 16, - Cout: 16, - Kh: 3, - Kw: 3, - pad_h: 1, - pad_w: 1, - stride_h: 1, - stride_w: 1, - bypassSIMD: 0, -} diff --git a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/src/snax-streamer-gemm-conv-simd.c b/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/src/snax-streamer-gemm-conv-simd.c deleted file mode 100644 index 48c54d139..000000000 --- a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv-simd/src/snax-streamer-gemm-conv-simd.c +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2024 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include "data.h" - -#include "snax-gemm-lib.h" -#include "snax-gemm-params.h" - -#include "snax-streamer-gemm-lib.h" -#include "snax-streamer-simd-lib.h" - -#include "snax-streamer-gemm-conv-simd-lib.h" - -// This is the main function for the SNAX GEMM for Conv2d -// We use several nested loops to iterate over the input data and weights, -// achieving implicit im2col -int main() { - // Set err value for checking - int err = 0; - - // Prepare addresses in TCDM - int8_t *local_a, *local_b; - int32_t *local_c, *local_d32; - int8_t *local_d8; - - // Allocate space in TCDM - local_a = (int8_t *)(snrt_l1_next() + delta_local_a); - local_b = (int8_t *)(snrt_l1_next() + delta_local_b); - local_c = (int32_t *)(snrt_l1_next() + delta_local_c); - local_d32 = (int32_t *)(snrt_l1_next() + delta_local_d32); - local_d8 = (int8_t *)(snrt_l1_next() + delta_local_d8); - - // Transfer data from L3 to L1 - // Using DMA only - if(snrt_cluster_idx() == 0){ - - if (snrt_is_dm_core()) { - load_conv_input_data(Nbatch, H + 2 * pad_h, W + 2 * pad_w, Cin, local_a, - A); - load_weight_data(Cout, Kh, Kw, Cin, local_b, B); - } - - // Wait for DMA to finish - snrt_cluster_hw_barrier(); - - if (snrt_is_dm_core()) { - 
snrt_dma_start_1d(local_c, C, - M * N * meshRow * meshCol * sizeof(int32_t)); - } - - snrt_cluster_hw_barrier(); - - if (snrt_global_core_idx() == 0) { - // Set Streamer configuration CSR for conv2d - set_gemmx_streamer_csr( - Aslstride0, Aslstride1, Atlbound0, Atlstride0, Atlbound1, - Atlstride1, Atlbound2, Atlstride2, Atlbound3, Atlstride3, Atlbound4, - Atlstride4, Atlbound5, Atlstride5, - - Bslstride0, Bslstride1, Btlbound0, Btlstride0, Btlbound1, - Btlstride1, Btlbound2, Btlstride2, - - D8slstride0, D8slstride1, D8tlbound0, D8tlstride0, D8tlbound1, - D8tlstride1, D8tlbound2, D8tlstride2, - - Cslstride0, Cslstride1, Ctlbound0, Ctlstride0, Ctlbound1, - Ctlstride1, Ctlbound2, Ctlstride2, - - D32slstride0, D32slstride1, D32tlbound0, D32tlstride0, D32tlbound1, - D32tlstride1, D32tlbound2, D32tlstride2, - - delta_local_a, delta_local_b, delta_local_d8, delta_local_c, - delta_local_d32, bypassSIMD); - - // Set CSR to start Streamer for conv2d - set_gemmx_streamer_start(); - - // Set GEMM configuration CSR - uint32_t subtraction_setting = - gen_subtraction_config(subtraction_a, subtraction_b); - - uint32_t csr0 = - gen_csr0_config(input_zp_i, output_zp_i, shift_i, max_int_i); - uint32_t csr1 = gen_csr1_config(min_int_i, double_round_i); - uint32_t csr2 = gen_csr2_config(multiplier_i); - - set_gemmx_csr(K, N, M, subtraction_setting, csr0, csr1, csr2, M * N, - bypassSIMD); - - // Set CSR to start GEMM - set_gemmx_start(); - - // Poll until Streamer and GEMM accelerator finish - wait_gemmx_and_streamer(); - - // check the result of the implicit im2col convolution - if (!bypassSIMD) { - err += - check_gemmx_result_D8(local_d8, D8_direct_conv2d, Batch, M, N); - } else { - err += check_gemmx_result_D32(local_d32, D32_direct_conv2d, Batch, - M, N); - } - printf("SNAX GEMM Conv2d: %s, err = %d . bypassSIMD = %d .\n", - err ? 
"FAIL" : "PASS", err, bypassSIMD); - }; - - }; - - return err; -} diff --git a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/Makefile b/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/Makefile deleted file mode 100755 index 252b0ce18..000000000 --- a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2023 KU Leuven. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Xiaoling Yi - -APP = snax-streamer-gemm-conv - -INCDIRS = data - -INCDIRS += ../../../snax/streamer-gemm/include -INCDIRS += ../../../snax/streamer-gemm-conv/include -INCDIRS += ../../../snax/gemm/include - -# Include this binary in the final build -RISCV_LDFLAGS += ../../../snax/streamer-gemm/build/snax-streamer-gemm-lib.o -RISCV_LDFLAGS += ../../../snax/streamer-gemm-conv/build/snax-streamer-gemm-conv-lib.o -RISCV_LDFLAGS += ../../../snax/gemm/build/snax-gemm-lib.o - -SRCS = src/snax-streamer-gemm-conv.c - -include ./data/Makefile -include ../../common.mk - -$(DEP): $(DATA_H) diff --git a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/Makefile b/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/Makefile deleted file mode 100755 index 18006cbf7..000000000 --- a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2023 KU Leuven. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Xiaoling Yi - -# Usage of absolute paths is required to externally include this Makefile -MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -DATA_DIR := $(realpath $(MK_DIR)) - -DATA_CFG ?= $(DATA_DIR)/params.hjson - -DATA_H = $(DATA_DIR)/data.h - -$(DATA_H): $(DATA_DIR)/datagen.py $(DATA_CFG) - $< -c $(DATA_CFG) > $@ - -.PHONY: clean-data clean - -clean-data: - rm -f $(DATA_H) - -clean: clean-data diff --git a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/datagen.py b/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/datagen.py deleted file mode 100755 index 38fb44069..000000000 --- a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/datagen.py +++ /dev/null @@ -1,514 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2024 KU Leuven. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Xiaoling Yi - -import numpy as np -import argparse -import pathlib -import hjson -import sys -import os - -# Add data utility path -sys.path.append( - os.path.join(os.path.dirname(__file__), "../../../../../../../../util/sim/") -) -from data_utils import format_scalar_definition, format_vector_definition # noqa E402 - -np.random.seed(42) - - -def conv2d(input_data, kernel, stride=(1, 1), padding=(0, 0)): - batch_size, in_height, in_width, in_channels = input_data.shape - out_channels, kernel_height, kernel_width, _ = kernel.shape - stride_h, stride_w = stride - pad_h, pad_w = padding - - # Calculate the output feature map dimensions - out_height = (in_height - kernel_height + 2 * pad_h) // stride_h + 1 - out_width = (in_width - kernel_width + 2 * pad_w) // stride_w + 1 - - # Add padding - input_data_padded = np.pad( - input_data, ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), mode="constant" - ) - - # Initialize the output feature map - output_data = np.zeros((batch_size, out_height, out_width, out_channels), np.int32) - - # 
Perform the convolution operation - for b in range(batch_size): - for oc in range(out_channels): - for oh in range(out_height): - for ow in range(out_width): - # Calculate the input region - ih_start = oh * stride_h - ih_end = ih_start + kernel_height - iw_start = ow * stride_w - iw_end = iw_start + kernel_width - - # Slice to extract the input region - input_region = input_data_padded[ - b, ih_start:ih_end, iw_start:iw_end, : - ] - - # Slice to extract the corresponding convolution kernel - conv_kernel = kernel[oc, :, :, :] - - # Perform the convolution calculation - output_data[b, oh, ow, oc] = np.sum(input_region * conv_kernel) - - return output_data - - -def im2col(input_data, kernel, stride=(1, 1), padding=(0, 0)): - batch_size, in_height, in_width, in_channels = input_data.shape - out_channels, kernel_height, kernel_width, _ = kernel.shape - stride_h, stride_w = stride - pad_h, pad_w = padding - - # Calculate the size of the output feature map - out_height = (in_height + 2 * pad_h - kernel_height) // stride_h + 1 - out_width = (in_width + 2 * pad_w - kernel_width) // stride_w + 1 - - # Apply zero padding to the input data - input_data_padded = np.pad( - input_data, ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), mode="constant" - ) - - # Initialize the im2col matrix - im2col_matrix = np.zeros( - (batch_size, out_height * out_width, in_channels * kernel_height * kernel_width) - ) - - # Perform the im2col transformation on the input data - for b in range(batch_size): - for oh in range(out_height): - for ow in range(out_width): - # Calculate the input region - ih_start = oh * stride_h - ih_end = ih_start + kernel_height - iw_start = ow * stride_w - iw_end = iw_start + kernel_width - - # Slice and extract the input region - input_region = input_data_padded[b, ih_start:ih_end, iw_start:iw_end, :] - - # Flatten the input region into a 1D vector and add it to the - # corresponding position in the im2col matrix - im2col_matrix[b, oh * out_width + ow, :] = 
input_region.reshape(-1) - - im2col_matrix = im2col_matrix.reshape(batch_size * out_height * out_width, -1) - im2col_kernel = kernel.reshape(out_channels, -1).T - - return im2col_matrix, im2col_kernel - - -# Golden model in python -def block_gemm_golden_model( - m, k, n, row, size, col, a, b, subtraction_a, subtraction_b -): - c = np.zeros(m * row * n * col, dtype=(np.int32)) - for mm in range(m): - for nn in range(n): - for kk in range(k): - for rr in range(row): - for cc in range(col): - for ss in range(size): - c_index = ( - mm * n * row * col + nn * row * col + rr * col + cc - ) - a_index = ( - mm * k * row * size + kk * row * size + rr * size + ss - ) - b_index = ( - nn * k * size * col + kk * size * col + cc * size + ss - ) - c[c_index] = c[c_index] + (a[a_index] - subtraction_a) * ( - b[b_index] - subtraction_b - ) - return c - - -def data_reshuffler_golden_model( - tempLoop0, - tempLoop1, - spatial_len_0, - spatial_len_1, - tempStride0, - tempStride1, - spatialStride0, - spatialStride1, - data, - int32=False, -): - # abstract illusion: k innermost loop, m second innermost loop, - # K third innermost loop, M outermost loop - - # total loop bounds = spatial loop bounds * temporal loop bounds - K = tempLoop0 * spatial_len_0 - M = tempLoop1 * spatial_len_1 - - # loop bounds settings - matrix_size = {"K": K, "M": M, "k": spatial_len_0, "m": spatial_len_1} - - # stride settings - strides = { - "M": tempStride1, - "K": tempStride0, - "m": spatialStride1, - "k": spatialStride0, - } - - if int32: - result_array = np.zeros((matrix_size["M"] * matrix_size["K"]), np.int32) - else: - result_array = np.zeros((matrix_size["M"] * matrix_size["K"]), np.int8) - - # apply strided layout mapping for the golden model of data reshuffler - for M in range(matrix_size["M"] // matrix_size["m"]): - for K in range(matrix_size["K"] // matrix_size["k"]): - for m in range(matrix_size["m"]): - for k in range(matrix_size["k"]): - result_array[ - # output address calculation with coutinued 
increment - matrix_size["K"] - // matrix_size["k"] - * matrix_size["k"] - * matrix_size["m"] - * M - + matrix_size["k"] * matrix_size["m"] * K - + m * matrix_size["k"] - + k - ] = data[ - # input address calculation with - # strided layout mapping eqaution - strides["M"] * M - + strides["K"] * K - + strides["m"] * m - + strides["k"] * k - ] - - return result_array.ravel() - - -# Add stdint.h header -def emit_header_file(**kwargs): - emit_str = "#include \n\n" - emit_str += emit_gemm_data(**kwargs) - return emit_str - - -MIN = -128 -MAX = 127 - - -def emit_gemm_data(**kwargs): - - # conv2d settings - Nbatch, H, W, Cin = (kwargs["Nbatch"], kwargs["H"], kwargs["W"], kwargs["Cin"]) - Cout, Kh, Kw, Cin = (kwargs["Cout"], kwargs["Kh"], kwargs["Kw"], kwargs["Cin"]) - - pad_h, pad_w = (kwargs["stride_h"], kwargs["stride_w"]) - stride_h, stride_w = (kwargs["pad_h"], kwargs["pad_w"]) - - # test data generation - input_data = np.random.randint(-10, 10, size=(Nbatch, H, W, Cin)) - kernel = np.random.randint(-10, 10, size=(Cout, Kh, Kw, Cin)) - - # inferred config from the input data and kernel - padding = pad_h, pad_w - stride = stride_h, stride_w - - input_padding = np.pad( - input_data, ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), mode="constant" - ) - - im2col_matrix, im2col_kernel = im2col( - input_data, kernel, stride=stride, padding=padding - ) - - M = im2col_matrix.shape[0] // 8 - K = im2col_matrix.shape[1] // 8 - N = im2col_kernel.shape[1] // 8 - - data_str = [] - - # Generating conv2d settings - data_str += [ - format_scalar_definition("int", "Nbatch", Nbatch), - format_scalar_definition("int", "H", H), - format_scalar_definition("int", "W", W), - format_scalar_definition("int", "Cin", Cin), - format_scalar_definition("int", "Cout", Cout), - format_scalar_definition("int", "Kh", Kh), - format_scalar_definition("int", "Kw", Kw), - format_scalar_definition("int", "stride_h", stride_h), - format_scalar_definition("int", "stride_w", stride_w), - 
format_scalar_definition("int", "pad_h", pad_h), - format_scalar_definition("int", "pad_w", pad_w), - ] - - # Generating matrix size settings - data_str += [ - format_scalar_definition("int", "Batch", Nbatch), - format_scalar_definition("int", "M", M), - format_scalar_definition("int", "K", K), - format_scalar_definition("int", "N", N), - ] - - # Generating base pointer settings - delta_local_a = 0 - delta_local_b = input_padding.size - delta_local_c = input_padding.size + kernel.size - data_str += [ - format_scalar_definition("int32_t", "delta_local_a", delta_local_a), - format_scalar_definition("int32_t", "delta_local_b", delta_local_b), - format_scalar_definition("int32_t", "delta_local_c", delta_local_c), - ] - - # for streamer cfg - # streamer setting for data mover A - Aslstride0 = 1 - Aslstride1 = Cin - - # K dim - Atlbound0 = Cin // 8 - Atlstride0 = 8 - - Atlbound1 = Kw - Atlstride1 = Cin * stride_w - - Atlbound2 = Kh - Atlstride2 = Cin * (W + 2 * pad_w) - - # N dim - Atlbound3 = Cout // 8 - Atlstride3 = 0 - - # M dim - Atlbound4 = W // 8 - Atlstride4 = Cin * 8 - - Atlbound5 = H - Atlstride5 = Cin * (W + 2 * pad_w) * stride_h - - # Batch dim - Atlbound6 = Nbatch - Atlstride6 = Cin * (H + 2 * pad_h) * (W + 2 * pad_w) - - data_str += [ - format_scalar_definition("int32_t", "Aslstride0", Aslstride0), - format_scalar_definition("int32_t", "Aslstride1", Aslstride1), - format_scalar_definition("int32_t", "Atlbound0", Atlbound0), - format_scalar_definition("int32_t", "Atlstride0", Atlstride0), - format_scalar_definition("int32_t", "Atlbound1", Atlbound1), - format_scalar_definition("int32_t", "Atlstride1", Atlstride1), - format_scalar_definition("int32_t", "Atlbound2", Atlbound2), - format_scalar_definition("int32_t", "Atlstride2", Atlstride2), - format_scalar_definition("int32_t", "Atlbound3", Atlbound3), - format_scalar_definition("int32_t", "Atlstride3", Atlstride3), - format_scalar_definition("int32_t", "Atlbound4", Atlbound4), - 
format_scalar_definition("int32_t", "Atlstride4", Atlstride4), - format_scalar_definition("int32_t", "Atlbound5", Atlbound5), - format_scalar_definition("int32_t", "Atlstride5", Atlstride5), - format_scalar_definition("int32_t", "Atlbound6", Atlbound6), - format_scalar_definition("int32_t", "Atlstride6", Atlstride6), - ] - - # streamer setting for data mover B - Bslstride0 = 1 - Bslstride1 = Cin * Kw * Kh - - # K dim - Btlbound0 = Cin * Kw * Kh // 8 - Btlstride0 = 8 - - # N dim - Btlbound1 = Cout // 8 - Btlstride1 = Cin * Kw * Kh * 8 - - # M dim - Btlbound2 = H * W // 8 - Btlstride2 = 0 - - # Batch dim - Btlbound3 = Nbatch - Btlstride3 = 0 - - data_str += [ - format_scalar_definition("int32_t", "Bslstride0", Bslstride0), - format_scalar_definition("int32_t", "Bslstride1", Bslstride1), - format_scalar_definition("int32_t", "Btlbound0", Btlbound0), - format_scalar_definition("int32_t", "Btlstride0", Btlstride0), - format_scalar_definition("int32_t", "Btlbound1", Btlbound1), - format_scalar_definition("int32_t", "Btlstride1", Btlstride1), - format_scalar_definition("int32_t", "Btlbound2", Btlbound2), - format_scalar_definition("int32_t", "Btlstride2", Btlstride2), - format_scalar_definition("int32_t", "Btlbound3", Btlbound3), - format_scalar_definition("int32_t", "Btlstride3", Btlstride3), - ] - - # streamer setting for data mover C - # C is int32_t so the stride is 4 times of the int8_t - Cslstride0 = 4 - Cslstride1 = Cout * 4 - - # N dim - Ctlbound0 = Cout // 8 - Ctlstride0 = 8 * 4 - - # M dim - # K is merged because of the block gemm output stationarity - Ctlbound1 = W // 8 - Ctlstride1 = Cout * 8 * 4 - - Ctlbound2 = H - Ctlstride2 = Cout * W * 4 - - # Batch dim - Ctlbound3 = Nbatch - Ctlstride3 = Cout * H * W * 4 - - data_str += [ - format_scalar_definition("int32_t", "Cslstride0", Cslstride0), - format_scalar_definition("int32_t", "Cslstride1", Cslstride1), - format_scalar_definition("int32_t", "Ctlbound0", Ctlbound0), - format_scalar_definition("int32_t", 
"Ctlstride0", Ctlstride0), - format_scalar_definition("int32_t", "Ctlbound1", Ctlbound1), - format_scalar_definition("int32_t", "Ctlstride1", Ctlstride1), - format_scalar_definition("int32_t", "Ctlbound2", Ctlbound2), - format_scalar_definition("int32_t", "Ctlstride2", Ctlstride2), - format_scalar_definition("int32_t", "Ctlbound3", Ctlbound3), - format_scalar_definition("int32_t", "Ctlstride3", Ctlstride3), - ] - - # Generating random 8 integer a and b for subtraction - subtraction_a = 0 - subtraction_b = 0 - - # Writing the subtraction value to data.h - data_str += [ - format_scalar_definition("int8_t", "subtraction_a", subtraction_a), - format_scalar_definition("int8_t", "subtraction_b", subtraction_b), - ] - - # conv2d using im2col - im2col_matrix = data_reshuffler_golden_model( - K, M, 8, 8, 8, 8 * 8 * K, 1, 8 * K, im2col_matrix.reshape(-1) - ) - # row major to column major for B - im2col_kernel = im2col_kernel.T - im2col_kernel = data_reshuffler_golden_model( - K, N, 8, 8, 8, 8 * 8 * K, 1, 8 * K, im2col_kernel.reshape(-1) - ) - im2col_conv2d_res = block_gemm_golden_model( - M, K, N, 8, 8, 8, im2col_matrix, im2col_kernel, 0, 0 - ) - # im2col_conv2d_res = data_reshuffler_golden_model(N, M, 8, 8, 8, 8 * 8 * N, - # 1, 8 * N, im2col_conv2d_res, 1) - - # direct conv2d - direct_conv2d_res = conv2d(input_data, kernel, stride=stride, padding=padding) - # output in NHWC format - direct_conv2d_res = direct_conv2d_res.reshape(-1) - - # Writing testing data and golden data into data.h - # implicit im2col matrix and kernel, store original input data and kernel - data_str += [format_vector_definition("int8_t", "A", input_padding.reshape(-1))] - data_str += [format_vector_definition("int8_t", "B", kernel.reshape(-1))] - - # explicit im2col matrix and kernel, store the columned input data - # for comparing with the implicit im2col method - # data_str += [format_vector_definition("int8_t", "A", im2col_matrix)] - # data_str += [format_vector_definition("int8_t", "B", 
im2col_kernel)] - - data_str += [ - format_vector_definition("int32_t", "C_gemm_golden", im2col_conv2d_res) - ] - data_str += [ - format_vector_definition("int32_t", "C_direct_conv2d", direct_conv2d_res) - ] - - data_str = "\n\n".join(data_str) - - return data_str - - -def test(): - np.set_printoptions(threshold=np.inf) - - # conv2d settings - Nbatch, H, W, Cin = (1, 24, 24, 40) - Cout, Kh, Kw, Cin = (8, 3, 3, 40) - - stride = (1, 1) - padding = (1, 1) - - # test data generation - input_data = np.random.randint(-10, 10, size=(Nbatch, H, W, Cin)) - kernel = np.random.randint(-10, 10, size=(Cout, Kh, Kw, Cin)) - - im2col_matrix, im2col_kernel = im2col( - input_data, kernel, stride=stride, padding=padding - ) - - M = im2col_matrix.shape[0] // 8 - K = im2col_matrix.shape[1] // 8 - N = im2col_kernel.shape[1] // 8 - - # conv2d using im2col - im2col_matrix, im2col_kernel = im2col( - input_data, kernel, stride=stride, padding=padding - ) - im2col_matrix = data_reshuffler_golden_model( - K, M, 8, 8, 8, 8 * 8 * K, 1, 8 * K, im2col_matrix.reshape(-1) - ) - im2col_kernel = data_reshuffler_golden_model( - K, N, 8, 8, 8, 8 * 8 * K, 1, 8 * K, im2col_kernel.T.reshape(-1) - ) - im2col_conv2d_res = block_gemm_golden_model( - M, K, N, 8, 8, 8, im2col_matrix, im2col_kernel, 0, 0 - ) - - # direct conv2d - direct_conv2d_res = conv2d(input_data, kernel, stride=stride, padding=padding) - direct_conv2d_res = direct_conv2d_res.reshape(-1) - direct_conv2d_res = data_reshuffler_golden_model( - N, M, 8, 8, 8, 8 * 8 * N, 1, 8 * N, direct_conv2d_res, 1 - ) - - # result comparison - assert (im2col_conv2d_res == direct_conv2d_res).all() - - -def main(): - # Parsing cmd args - parser = argparse.ArgumentParser(description="Generate data for kernels") - parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", - ) - args = parser.parse_args() - - # Load param config file - with args.cfg.open() as f: - param = hjson.loads(f.read()) - - # 
Emit header file - print(emit_header_file(**param)) - - -if __name__ == "__main__": - - # for testing - # test() - - main() diff --git a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/params.hjson b/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/params.hjson deleted file mode 100755 index 738fc9d43..000000000 --- a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/data/params.hjson +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2024 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Xiaoling Yi - -{ - Nbatch: 1, - H: 16, - W: 16, - Cin: 16, - Cout: 16, - Kh: 3, - Kw: 3, - pad_h: 1, - pad_w: 1, - stride_h: 1, - stride_w: 1, -} diff --git a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/src/snax-streamer-gemm-conv.c b/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/src/snax-streamer-gemm-conv.c deleted file mode 100755 index 37ba420cb..000000000 --- a/target/sim/sw/device/apps/snax/snax-streamer-gemm-conv/src/snax-streamer-gemm-conv.c +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2024 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include "data.h" - -#include "snax-gemm-lib.h" - -#include "snax-streamer-gemm-lib.h" - -#include "snax-streamer-gemm-conv-lib.h" - -#include "uart.h" - -// This is the main function for the SNAX GEMM for Conv2d -// We use several nested loops to iterate over the input data and weights, -// achieving implicit im2col -int main() { - // Set err value for checking - int err = 0; - uint32_t group0_bound_lower = 0; - uint32_t group0_bound_upper = snrt_cluster_core_num(); // 2 - // only test one cluster - if(snrt_global_core_idx() - -# Usage of absolute paths is required to externally include -# this Makefile from multiple different locations - -MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -include $(MK_DIR)/../common.mk - -############ -## Outputs # -############ - -OBJS = $(BUILDDIR)/snax-data-reshuffler-lib.o -ALL_OUTPUTS = $(OBJS) - -INCDIRS += $(abspath include) -########## -## Rules # -########## - -.PHONY: all -all: $(ALL_OUTPUTS) - -.PHONY: clean -clean: - rm -rf $(BUILDDIR) - - diff --git a/target/sim/sw/device/snax/data-reshuffler/include/snax-data-reshuffler-lib.h b/target/sim/sw/device/snax/data-reshuffler/include/snax-data-reshuffler-lib.h deleted file mode 100644 index 4d4cba37f..000000000 --- a/target/sim/sw/device/snax/data-reshuffler/include/snax-data-reshuffler-lib.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2024 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include -#include "snrt.h" -#include "stdint.h" - -#pragma once - -#define spatial_len_0 8 -#define spatial_len_1 8 -#define spatial_len (spatial_len_0 * spatial_len_1) - -// Set STREAMER configuration CSR -void set_data_reshuffler_csr(int tempLoop0_in, int tempLoop1_in, - int tempLoop2_in, int tempLoop3_in, - int tempLoop4_in, int tempStride0_in, - int tempStride1_in, int tempStride2_in, - int tempStride3_in, int tempStride4_in, - int spatialStride1_in, int tempLoop0_out, - int tempLoop1_out, int tempLoop2_out, - int tempStride0_out, int tempStride1_out, - int tempStride2_out, int spatialStride1_out, - int32_t delta_local_in, int32_t delta_local_out); - -// Set CSR to start STREAMER -void start_streamer(); - -void wait_streamer(); - -void set_data_reshuffler(int T2Len, int reduceLen, int opcode); - -void start_data_reshuffler(); - -void wait_data_reshuffler(); - -uint32_t read_data_reshuffler_perf_counter(); - -void load_data_reshuffler_test_data(int tempLoop0, int tempLoop1, - int tempStride0, int tempStride1, - int spatialStride1, int8_t* base_ptr_local, - int8_t* base_ptr_l2); - -uint32_t check_data_reshuffler_result(int tempLoop0, int tempLoop1, - int tempStride0, int tempStride1, - int spatialStride1, - int8_t* base_ptr_local, - int8_t* base_ptr_l2); - -void load_a_chrunk_of_data(int8_t* base_ptr_local, int8_t* base_ptr_l2, - int len); - -uint32_t test_a_chrunk_of_data(int8_t* base_ptr_local, int8_t* base_ptr_l2, - int len); diff --git a/target/sim/sw/device/snax/data-reshuffler/src/snax-data-reshuffler-lib.c b/target/sim/sw/device/snax/data-reshuffler/src/snax-data-reshuffler-lib.c deleted file mode 100644 index 06cc83df7..000000000 --- a/target/sim/sw/device/snax/data-reshuffler/src/snax-data-reshuffler-lib.c +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright 2024 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include "snax-data-reshuffler-lib.h" - -// Set STREAMER configuration CSR -void set_data_reshuffler_csr(int tempLoop0_in, int tempLoop1_in, - int tempLoop2_in, int tempLoop3_in, - int tempLoop4_in, int tempStride0_in, - int tempStride1_in, int tempStride2_in, - int tempStride3_in, int tempStride4_in, - int spatialStride1_in, int tempLoop0_out, - int tempLoop1_out, int tempLoop2_out, - int tempStride0_out, int tempStride1_out, - int tempStride2_out, int spatialStride1_out, - int32_t delta_local_in, int32_t delta_local_out) { - // temporal loop bounds, from innermost to outermost for data reader (In) - write_csr(960, tempLoop0_in); - write_csr(961, tempLoop1_in); - write_csr(962, tempLoop2_in); - write_csr(963, tempLoop3_in); - write_csr(964, tempLoop4_in); - - // temporal loop bounds, from innermost to outermost for data writer (Out) - write_csr(965, tempLoop0_out); - write_csr(966, tempLoop1_out); - write_csr(967, tempLoop2_out); - - // temporal strides for data reader (In) - write_csr(968, tempStride0_in); - write_csr(969, tempStride1_in); - write_csr(970, tempStride2_in); - write_csr(971, tempStride3_in); - write_csr(972, tempStride4_in); - - // temporal strides for data writer (Out) - write_csr(973, tempStride0_out); - write_csr(974, tempStride1_out); - write_csr(975, tempStride2_out); - - // fixed spatial strides for data reader (In) - write_csr(976, spatialStride1_in); - - // fixed spatial strides for data writer (Out) - write_csr(977, spatialStride1_out); - - // base ptr for data reader (In) - write_csr(978, (uint32_t)(delta_local_in + snrt_l1_next())); - - // base ptr for data writer (Out) - write_csr(979, (uint32_t)(delta_local_out + snrt_l1_next())); -} - -// Set CSR to start STREAMER -void start_streamer() { write_csr(980, 1); } - -void wait_streamer() { write_csr(980, 0); } - -void set_data_reshuffler(int T2Len, int reduceLen, int opcode) { - // set transpose or not - uint32_t csr0 = 
((uint32_t)T2Len << 7) | ((uint32_t)reduceLen << 2) | - ((uint32_t)opcode); - write_csr(982, csr0); -} - -void start_data_reshuffler() { write_csr(983, 1); } - -void wait_data_reshuffler() { write_csr(983, 0); } - -uint32_t read_data_reshuffler_perf_counter() { - uint32_t perf_counter = read_csr(981); - return perf_counter; -} - -void load_data_reshuffler_test_data(int tempLoop0, int tempLoop1, - int tempStride0, int tempStride1, - int spatialStride1, int8_t* base_ptr_local, - int8_t* base_ptr_l2) { - int8_t* addr_in; - int8_t* addr_In; - - for (int loop1 = 0; loop1 < tempLoop1; loop1++) { - for (int loop0 = 0; loop0 < tempLoop0; loop0++) { - for (int spatial_i_1 = 0; spatial_i_1 < spatial_len_1; - spatial_i_1++) { - addr_in = base_ptr_local + loop1 * tempStride1 + - loop0 * tempStride0 + spatial_i_1 * spatialStride1; - addr_In = base_ptr_l2 + loop1 * tempLoop0 * spatial_len + - loop0 * spatial_len + spatial_i_1 * spatial_len_1; - snrt_dma_start_1d(addr_in, addr_In, - spatial_len_0 * sizeof(int8_t)); - } - } - } -} - -uint32_t check_data_reshuffler_result(int tempLoop0, int tempLoop1, - int tempStride0, int tempStride1, - int spatialStride1, - int8_t* base_ptr_local, - int8_t* base_ptr_l2) { - int8_t* addr_out; - int8_t* addr_Out; - uint32_t error = 0; - - for (int loop1 = 0; loop1 < tempLoop1; loop1++) { - for (int loop0 = 0; loop0 < tempLoop0; loop0++) { - for (int spatial_i_1 = 0; spatial_i_1 < spatial_len_1; - spatial_i_1++) { - for (int spatial_i_0 = 0; spatial_i_0 < spatial_len_0; - spatial_i_0++) { - addr_out = base_ptr_local + loop1 * tempStride1 + - loop0 * tempStride0 + - spatial_i_1 * spatialStride1 + spatial_i_0; - addr_Out = base_ptr_l2 + loop1 * tempLoop0 * spatial_len + - loop0 * spatial_len + - spatial_i_1 * spatial_len_1 + spatial_i_0; - if ((int8_t)*addr_out != (int8_t)*addr_Out) { - printf( - "Error: after reshuffle addr_out = %d at address " - "%x, golden addr_Out = %d at address %x \n", - (int8_t)*addr_out, addr_out, (int8_t)*addr_Out, - 
addr_Out); - error++; - } - } - } - } - } - - return error; -} - -void load_a_chrunk_of_data(int8_t* base_ptr_local, int8_t* base_ptr_l2, - int len) { - snrt_dma_start_1d(base_ptr_local, base_ptr_l2, len * sizeof(int8_t)); -} - -uint32_t test_a_chrunk_of_data(int8_t* base_ptr_local, int8_t* base_ptr_l2, - int len) { - uint32_t error = 0; - for (int i = 0; i < len; i++) { - if ((int8_t)base_ptr_local[i] != (int8_t)base_ptr_l2[i]) { - printf( - "Error: after reshuffle base_ptr_local[%d] = %d, golden " - "base_ptr_l2[%d] = %d \n", - i, (int8_t)base_ptr_local[i], i, (int8_t)base_ptr_l2[i]); - error++; - } - } - return error; -} diff --git a/target/sim/sw/device/snax/streamer-simd/Makefile b/target/sim/sw/device/snax/gemmx/Makefile similarity index 91% rename from target/sim/sw/device/snax/streamer-simd/Makefile rename to target/sim/sw/device/snax/gemmx/Makefile index f935bfba9..f6656a88b 100644 --- a/target/sim/sw/device/snax/streamer-simd/Makefile +++ b/target/sim/sw/device/snax/gemmx/Makefile @@ -14,10 +14,11 @@ include $(MK_DIR)/../common.mk ## Outputs # ############ -OBJS = $(BUILDDIR)/snax-streamer-simd-lib.o +OBJS = $(BUILDDIR)/snax-gemmx-lib.o ALL_OUTPUTS = $(OBJS) INCDIRS += $(abspath include) + ########## ## Rules # ########## @@ -28,5 +29,3 @@ all: $(ALL_OUTPUTS) .PHONY: clean clean: rm -rf $(BUILDDIR) - - diff --git a/target/sim/sw/device/snax/streamer-gemm-conv-simd/include/snax-streamer-gemm-conv-simd-lib.h b/target/sim/sw/device/snax/gemmx/include/snax-gemmx-lib.h similarity index 76% rename from target/sim/sw/device/snax/streamer-gemm-conv-simd/include/snax-streamer-gemm-conv-simd-lib.h rename to target/sim/sw/device/snax/gemmx/include/snax-gemmx-lib.h index d8ac1eda1..d47c6a6b1 100644 --- a/target/sim/sw/device/snax/streamer-gemm-conv-simd/include/snax-streamer-gemm-conv-simd-lib.h +++ b/target/sim/sw/device/snax/gemmx/include/snax-gemmx-lib.h @@ -4,18 +4,28 @@ // // Xiaoling Yi -#include #include "snrt.h" + +#include #include "stdint.h" #pragma once 
-// load input data from L3 to L1 -void load_conv_input_data(int N, int H, int W, int C, int8_t* base_ptr_local, - int8_t* base_ptr_l2); +// Pack matrix size setting to one CSR +int32_t gen_size_config(uint8_t Batch, uint8_t M, uint8_t K, uint8_t N); + +// Pack two subtraction values to one CSR +int32_t gen_subtraction_config(int8_t subtraction_a, int8_t subtraction_b); + +// generate the configuration for CSR0 +int32_t gen_csr0_config(uint8_t input_zp_i, uint8_t output_zp_i, + uint8_t shift_i, uint8_t max_int_i); + +// generate the configuration for CSR1 +int32_t gen_csr1_config(uint8_t min_int_i, bool double_round_i); -void load_weight_data(int K, int Fy, int Fx, int C, int8_t* base_ptr_local, - int8_t* base_ptr_l2); +// generate the configuration for CSR2 +int32_t gen_csr2_config(uint32_t multiplier_i); // Set STREAMER configuration CSR void set_gemmx_streamer_csr( @@ -37,7 +47,8 @@ void set_gemmx_streamer_csr( int D32tlbound1, int D32tlstride1, int D32tlbound2, int D32tlstride2, int delta_local_a, int delta_local_b, int delta_local_d8, int delta_local_c, - int delta_local_d32, int bypassSIMD); + int delta_local_d32, int bypassSIMD, int32_t transpose_A, + int32_t transpose_B); // Set CSR to start STREAMER void set_gemmx_streamer_start(); diff --git a/target/sim/sw/device/snax/gemmx/include/snax-gemmx-params.h b/target/sim/sw/device/snax/gemmx/include/snax-gemmx-params.h new file mode 100644 index 000000000..cea91a4b9 --- /dev/null +++ b/target/sim/sw/device/snax/gemmx/include/snax-gemmx-params.h @@ -0,0 +1,11 @@ +// Copyright 2023 KU Leuven. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Xiaoling Yi + +#pragma once + +#define meshRow 8 +#define tileSize 8 +#define meshCol 8 diff --git a/target/sim/sw/device/snax/gemmx/include/streamer_csr_addr_map.h b/target/sim/sw/device/snax/gemmx/include/streamer_csr_addr_map.h new file mode 100644 index 000000000..f7d6cd9bf --- /dev/null +++ b/target/sim/sw/device/snax/gemmx/include/streamer_csr_addr_map.h @@ -0,0 +1,73 @@ +// Copyright 2024 KU Leuven. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Xiaoling Yi +// This file is generated by Streamer module in hw/chisel to map the CSR address of Streamer automatically, do not modify it manually +// Generated at 2024-09-19T10:30:33.292846Z + +// CSR Map for READER_0 +#define BASE_PTR_READER_0_LOW 960 +#define BASE_PTR_READER_0_HIGH 961 +#define S_STRIDE_READER_0_0 962 +#define T_BOUND_READER_0_0 963 +#define T_BOUND_READER_0_1 964 +#define T_BOUND_READER_0_2 965 +#define T_BOUND_READER_0_3 966 +#define T_BOUND_READER_0_4 967 +#define T_BOUND_READER_0_5 968 +#define T_STRIDE_READER_0_0 969 +#define T_STRIDE_READER_0_1 970 +#define T_STRIDE_READER_0_2 971 +#define T_STRIDE_READER_0_3 972 +#define T_STRIDE_READER_0_4 973 +#define T_STRIDE_READER_0_5 974 +// CSR Map for READER_1 +#define BASE_PTR_READER_1_LOW 975 +#define BASE_PTR_READER_1_HIGH 976 +#define S_STRIDE_READER_1_0 977 +#define T_BOUND_READER_1_0 978 +#define T_BOUND_READER_1_1 979 +#define T_BOUND_READER_1_2 980 +#define T_STRIDE_READER_1_0 981 +#define T_STRIDE_READER_1_1 982 +#define T_STRIDE_READER_1_2 983 +// CSR Map for WRITER_0 +#define BASE_PTR_WRITER_0_LOW 984 +#define BASE_PTR_WRITER_0_HIGH 985 +#define S_STRIDE_WRITER_0_0 986 +#define T_BOUND_WRITER_0_0 987 +#define T_BOUND_WRITER_0_1 988 +#define T_BOUND_WRITER_0_2 989 +#define T_STRIDE_WRITER_0_0 990 +#define T_STRIDE_WRITER_0_1 991 +#define T_STRIDE_WRITER_0_2 992 +// CSR Map for READER_WRITER_0 +#define 
BASE_PTR_READER_WRITER_0_LOW 993 +#define BASE_PTR_READER_WRITER_0_HIGH 994 +#define S_STRIDE_READER_WRITER_0_0 995 +#define T_BOUND_READER_WRITER_0_0 996 +#define T_BOUND_READER_WRITER_0_1 997 +#define T_BOUND_READER_WRITER_0_2 998 +#define T_STRIDE_READER_WRITER_0_0 999 +#define T_STRIDE_READER_WRITER_0_1 1000 +#define T_STRIDE_READER_WRITER_0_2 1001 +// CSR Map for READER_WRITER_1 +#define BASE_PTR_READER_WRITER_1_LOW 1002 +#define BASE_PTR_READER_WRITER_1_HIGH 1003 +#define S_STRIDE_READER_WRITER_1_0 1004 +#define T_BOUND_READER_WRITER_1_0 1005 +#define T_BOUND_READER_WRITER_1_1 1006 +#define T_BOUND_READER_WRITER_1_2 1007 +#define T_STRIDE_READER_WRITER_1_0 1008 +#define T_STRIDE_READER_WRITER_1_1 1009 +#define T_STRIDE_READER_WRITER_1_2 1010 +#define TRANSPOSE_CSR_READER_0 1011 +#define TRANSPOSE_CSR_READER_1 1012 +#define TRANSPOSE_EXTENSION_ENABLE +// Other resgiters +// Status register +#define STREAMER_START_CSR 1013 +// Read only CSRs +#define STREAMER_BUSY_CSR 1014 +#define STREAMER_PERFORMANCE_COUNTER_CSR 1015 diff --git a/target/sim/sw/device/snax/gemmx/src/snax-gemmx-lib.c b/target/sim/sw/device/snax/gemmx/src/snax-gemmx-lib.c new file mode 100644 index 000000000..5c2361c5a --- /dev/null +++ b/target/sim/sw/device/snax/gemmx/src/snax-gemmx-lib.c @@ -0,0 +1,262 @@ +// Copyright 2024 KU Leuven. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Xiaoling Yi + +#include "snax-gemmx-lib.h" +#include +#include "snax-gemmx-params.h" +#include "snrt.h" +#include "stdint.h" +#include "streamer_csr_addr_map.h" + +int32_t gen_size_config(uint8_t Batch, uint8_t M, uint8_t K, uint8_t N) { + return ((int32_t)Batch << 24) | ((int32_t)M << 16) | ((int32_t)K << 8) | + (int32_t)N; +} + +int32_t gen_subtraction_config(int8_t subtraction_a, int8_t subtraction_b) { + return ((uint8_t)subtraction_b << 8) | (uint8_t)subtraction_a; +} + +int32_t gen_csr0_config(uint8_t input_zp_i, uint8_t output_zp_i, + uint8_t shift_i, uint8_t max_int_i) { + // encode the configuration into a single 32-bit integer + return ((int32_t)max_int_i << 24) | ((int32_t)shift_i << 16) | + ((int32_t)output_zp_i << 8) | (int32_t)input_zp_i; +} + +int32_t gen_csr1_config(uint8_t min_int_i, bool double_round_i) { + // encode the configuration into a single 32-bit integer + return ((uint8_t)double_round_i << 8) | (uint8_t)min_int_i; +} + +int32_t gen_csr2_config(uint32_t multiplier_i) { return multiplier_i; } + +// Set STREAMER configuration CSR +void set_gemmx_streamer_csr( + int Aslstride0, int Aslstride1, int Atlbound0, int Atlstride0, + int Atlbound1, int Atlstride1, int Atlbound2, int Atlstride2, int Atlbound3, + int Atlstride3, int Atlbound4, int Atlstride4, int Atlbound5, + int Atlstride5, + + int Bslstride0, int Bslstride1, int Btlbound0, int Btlstride0, + int Btlbound1, int Btlstride1, int Btlbound2, int Btlstride2, + + int D8slstride0, int D8slstride1, int D8tlbound0, int D8tlstride0, + int D8tlbound1, int D8tlstride1, int D8tlbound2, int D8tlstride2, + + int Cslstride0, int Cslstride1, int Ctlbound0, int Ctlstride0, + int Ctlbound1, int Ctlstride1, int Ctlbound2, int Ctlstride2, + + int D32slstride0, int D32slstride1, int D32tlbound0, int D32tlstride0, + int D32tlbound1, int D32tlstride1, int D32tlbound2, int D32tlstride2, + + int delta_local_a, int delta_local_b, int delta_local_d8, int 
delta_local_c, + int delta_local_d32, int bypassSIMD, int32_t transpose_A, + int32_t transpose_B) { + // base ptr for A + csrw_ss(BASE_PTR_READER_0_LOW, (uint32_t)(delta_local_a + snrt_l1_next())); + + // spatial strides for A + csrw_ss(S_STRIDE_READER_0_0, Aslstride1); + + // loop bounds, from innermost to outermost, for data mover A + csrw_ss(T_BOUND_READER_0_0, Atlbound0); + csrw_ss(T_BOUND_READER_0_1, Atlbound1); + csrw_ss(T_BOUND_READER_0_2, Atlbound2); + csrw_ss(T_BOUND_READER_0_3, Atlbound3); + csrw_ss(T_BOUND_READER_0_4, Atlbound4); + csrw_ss(T_BOUND_READER_0_5, Atlbound5); + + // temporal strides for A + csrw_ss(T_STRIDE_READER_0_0, Atlstride0); + csrw_ss(T_STRIDE_READER_0_1, Atlstride1); + csrw_ss(T_STRIDE_READER_0_2, Atlstride2); + csrw_ss(T_STRIDE_READER_0_3, Atlstride3); + csrw_ss(T_STRIDE_READER_0_4, Atlstride4); + csrw_ss(T_STRIDE_READER_0_5, Atlstride5); + + // base ptr for B + csrw_ss(BASE_PTR_READER_1_LOW, (uint32_t)(delta_local_b + snrt_l1_next())); + + // spatial strides for B + csrw_ss(S_STRIDE_READER_1_0, Bslstride1); + + // loop bounds, from innermost to outermost, for data mover B + csrw_ss(T_BOUND_READER_1_0, Btlbound0); + csrw_ss(T_BOUND_READER_1_1, Btlbound1); + csrw_ss(T_BOUND_READER_1_2, Btlbound2); + + // temporal strides for B + csrw_ss(T_STRIDE_READER_1_0, Btlstride0); + csrw_ss(T_STRIDE_READER_1_1, Btlstride1); + csrw_ss(T_STRIDE_READER_1_2, Btlstride2); + + // base ptr for D8 + csrw_ss(BASE_PTR_WRITER_0_LOW, (uint32_t)(delta_local_d8 + snrt_l1_next())); + + // spatial strides for D8 + csrw_ss(S_STRIDE_WRITER_0_0, D8slstride1); + + // for D8, from N to M + if (bypassSIMD == 0) { + csrw_ss(T_BOUND_WRITER_0_0, D8tlbound0); + csrw_ss(T_BOUND_WRITER_0_1, D8tlbound1); + csrw_ss(T_BOUND_WRITER_0_2, D8tlbound2); + } else { + csrw_ss(T_BOUND_WRITER_0_0, 0); + csrw_ss(T_BOUND_WRITER_0_1, 0); + csrw_ss(T_BOUND_WRITER_0_2, 0); + } + + // temporal strides for D8 + csrw_ss(T_STRIDE_WRITER_0_0, D8tlstride0); + csrw_ss(T_STRIDE_WRITER_0_1, 
D8tlstride1); + csrw_ss(T_STRIDE_WRITER_0_2, D8tlstride2); + + // base ptr for C + csrw_ss(BASE_PTR_READER_WRITER_0_LOW, + (uint32_t)(delta_local_c + snrt_l1_next())); + + // spatial strides for C + csrw_ss(S_STRIDE_READER_WRITER_0_0, Cslstride1); + + // loop bounds, from innermost to outermost, for data mover C + csrw_ss(T_BOUND_READER_WRITER_0_0, Ctlbound0); + csrw_ss(T_BOUND_READER_WRITER_0_1, Ctlbound1); + csrw_ss(T_BOUND_READER_WRITER_0_2, Ctlbound2); + + // temporal strides for C + csrw_ss(T_STRIDE_READER_WRITER_0_0, Ctlstride0); + csrw_ss(T_STRIDE_READER_WRITER_0_1, Ctlstride1); + csrw_ss(T_STRIDE_READER_WRITER_0_2, Ctlstride2); + + // base ptr for D32 + csrw_ss(BASE_PTR_READER_WRITER_1_LOW, + (uint32_t)(delta_local_d32 + snrt_l1_next())); + + // spatial strides for D32 + csrw_ss(S_STRIDE_READER_WRITER_1_0, D32slstride1); + + // for D32, from N to M + if (bypassSIMD == 0) { + csrw_ss(T_BOUND_READER_WRITER_1_0, 0); + csrw_ss(T_BOUND_READER_WRITER_1_1, 0); + csrw_ss(T_BOUND_READER_WRITER_1_2, 0); + } else { + csrw_ss(T_BOUND_READER_WRITER_1_0, D32tlbound0); + csrw_ss(T_BOUND_READER_WRITER_1_1, D32tlbound1); + csrw_ss(T_BOUND_READER_WRITER_1_2, D32tlbound2); + } + + // temporal strides for D32 + csrw_ss(T_STRIDE_READER_WRITER_1_0, D32tlstride0); + csrw_ss(T_STRIDE_READER_WRITER_1_1, D32tlstride1); + csrw_ss(T_STRIDE_READER_WRITER_1_2, D32tlstride2); + + // set the transpose +#ifdef TRANSPOSE_EXTENSION_ENABLE + csrw_ss(TRANSPOSE_CSR_READER_0, transpose_A == 0 ? 1 : 0); + csrw_ss(TRANSPOSE_CSR_READER_1, transpose_B == 0 ? 
1 : 0); +#endif +} + +// Set CSR to start STREAMER +void set_gemmx_streamer_start() { csrw_ss(STREAMER_START_CSR, 1); } + +#define GEMMX_CSR_ADDR_BASE (STREAMER_PERFORMANCE_COUNTER_CSR + 1) +#define T_BOUND_K (GEMMX_CSR_ADDR_BASE + 0) +#define T_BOUND_N (GEMMX_CSR_ADDR_BASE + 1) +#define T_BOUND_M (GEMMX_CSR_ADDR_BASE + 2) + +#define SUBTRACTIONS (GEMMX_CSR_ADDR_BASE + 3) + +#define SIMD_CSR0 (GEMMX_CSR_ADDR_BASE + 4) +#define SIMD_CSR1 (GEMMX_CSR_ADDR_BASE + 5) +#define SIMD_CSR2 (GEMMX_CSR_ADDR_BASE + 6) + +#define TEMPORAL_LOOP_BOUND (GEMMX_CSR_ADDR_BASE + 7) +#define BYPASS_SIMD (GEMMX_CSR_ADDR_BASE + 8) + +#define GEMMX_START (GEMMX_CSR_ADDR_BASE + 9) +#define GEMMX_BUSY (GEMMX_CSR_ADDR_BASE + 10) +#define GEMMX_PERFORMANCE_COUNTER (GEMMX_CSR_ADDR_BASE + 11) + +// Set GEMM configuration CSR +void set_gemmx_csr(int tempLoop0, int tempLoop1, int tempLoop2, + int subtractions, uint32_t csr0, uint32_t csr1, + uint32_t csr2, uint32_t temporal_loop_bound, + uint32_t bypassSIMD) { + // set loop bounds, from innermost to outermost, aka from K to N to M + csrw_ss(T_BOUND_K, tempLoop0); + csrw_ss(T_BOUND_N, tempLoop1); + csrw_ss(T_BOUND_M, tempLoop2); + + // set subtraction a and b + csrw_ss(SUBTRACTIONS, subtractions); + + // set the constants for the SIMD unit + csrw_ss(SIMD_CSR0, csr0); + csrw_ss(SIMD_CSR1, csr1); + csrw_ss(SIMD_CSR2, csr2); + + // set the temporal loop bound + csrw_ss(TEMPORAL_LOOP_BOUND, temporal_loop_bound); + csrw_ss(BYPASS_SIMD, bypassSIMD); +} + +// Set CSR to start GEMM +void set_gemmx_start() { csrw_ss(GEMMX_START, 1); } + +// Stall until Streamer and GEMM accelerator finish +void wait_gemmx_and_streamer() { + csrw_ss(STREAMER_START_CSR, 0); + csrw_ss(STREAMER_START_CSR, 0); + while (csrr_ss(GEMMX_BUSY)) { + } + while (csrr_ss(STREAMER_BUSY_CSR)) { + } + csrw_ss(GEMMX_START, 0); +} + +// Read performance counter of the Streamer, a read-only CSR +uint32_t read_gemmx_streamer_perf_counter() { + uint32_t perf_counter = 
csrr_ss(STREAMER_PERFORMANCE_COUNTER_CSR); + return perf_counter; +} + +// Read performance counter of GEMM, a read-only CSR +uint32_t read_gemmx_perf_counter() { + uint32_t perf_counter = csrr_ss(GEMMX_PERFORMANCE_COUNTER); + return perf_counter; +} + +uint32_t check_gemmx_result_D8(int8_t* output, int8_t* output_golden, + int32_t Batch, int32_t M, int32_t N) { + uint32_t err = 0; + uint32_t size = 0; + size = Batch * M * N * meshRow * meshCol; + + for (int i = 0; i < size; i++) { + if (output[i] != output_golden[i]) { + err++; + } + } + return err; +} + +uint32_t check_gemmx_result_D32(int32_t* output, int32_t* output_golden, + int32_t Batch, int32_t M, int32_t N) { + uint32_t err = 0; + uint32_t size = 0; + size = Batch * M * N * meshRow * meshCol; + + for (int i = 0; i < size; i++) { + if (output[i] != output_golden[i]) { + err++; + } + } + return err; +} diff --git a/target/sim/sw/device/snax/streamer-gemm-conv-simd/Makefile b/target/sim/sw/device/snax/streamer-gemm-conv-simd/Makefile deleted file mode 100644 index 6f71a1602..000000000 --- a/target/sim/sw/device/snax/streamer-gemm-conv-simd/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2023 KU Leuven. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Xiaoling Yi - -# Usage of absolute paths is required to externally include -# this Makefile from multiple different locations - -MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -include $(MK_DIR)/../common.mk - -############ -## Outputs # -############ - -OBJS = $(BUILDDIR)/snax-streamer-gemm-conv-simd-lib.o -ALL_OUTPUTS = $(OBJS) - -INCDIRS += $(abspath include) -########## -## Rules # -########## - -.PHONY: all -all: $(ALL_OUTPUTS) - -.PHONY: clean -clean: - rm -rf $(BUILDDIR) - - diff --git a/target/sim/sw/device/snax/streamer-gemm-conv-simd/src/snax-streamer-gemm-conv-simd-lib.c b/target/sim/sw/device/snax/streamer-gemm-conv-simd/src/snax-streamer-gemm-conv-simd-lib.c deleted file mode 100644 index 5d0f7101d..000000000 --- a/target/sim/sw/device/snax/streamer-gemm-conv-simd/src/snax-streamer-gemm-conv-simd-lib.c +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright 2024 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include "snax-streamer-gemm-conv-simd-lib.h" -#include -#include "snrt.h" -#include "stdint.h" - -// load input data from L3 to L1 -void load_conv_input_data(int N, int H, int W, int C, int8_t* base_ptr_local, - int8_t* base_ptr_l2) { - snrt_dma_start_1d(base_ptr_local, base_ptr_l2, - N * H * W * C * sizeof(int8_t)); -} - -void load_weight_data(int K, int Fy, int Fx, int C, int8_t* base_ptr_local, - int8_t* base_ptr_l2) { - snrt_dma_start_1d(base_ptr_local, base_ptr_l2, - K * Fy * Fx * C * sizeof(int8_t)); -} - -// Set STREAMER configuration CSR -void set_gemmx_streamer_csr( - int Aslstride0, int Aslstride1, int Atlbound0, int Atlstride0, - int Atlbound1, int Atlstride1, int Atlbound2, int Atlstride2, int Atlbound3, - int Atlstride3, int Atlbound4, int Atlstride4, int Atlbound5, - int Atlstride5, - - int Bslstride0, int Bslstride1, int Btlbound0, int Btlstride0, - int Btlbound1, int Btlstride1, int Btlbound2, int Btlstride2, - - int D8slstride0, int D8slstride1, int D8tlbound0, int D8tlstride0, - int D8tlbound1, int D8tlstride1, int D8tlbound2, int D8tlstride2, - - int Cslstride0, int Cslstride1, int Ctlbound0, int Ctlstride0, - int Ctlbound1, int Ctlstride1, int Ctlbound2, int Ctlstride2, - - int D32slstride0, int D32slstride1, int D32tlbound0, int D32tlstride0, - int D32tlbound1, int D32tlstride1, int D32tlbound2, int D32tlstride2, - - int delta_local_a, int delta_local_b, int delta_local_d8, int delta_local_c, - int delta_local_d32, int bypassSIMD) { - // loop bounds, from innermost to outermost, for data mover A - write_csr(960, Atlbound0); - write_csr(961, Atlbound1); - write_csr(962, Atlbound2); - write_csr(963, Atlbound3); - write_csr(964, Atlbound4); - write_csr(965, Atlbound5); - - // loop bounds, from innermost to outermost, for data mover B - write_csr(966, Btlbound0); - write_csr(967, Btlbound1); - write_csr(968, Btlbound2); - - // for D8, from N to M - if (bypassSIMD == 0) { - 
write_csr(969, D8tlbound0); - write_csr(970, D8tlbound1); - write_csr(971, D8tlbound2); - } else { - write_csr(969, 0); - write_csr(970, 0); - write_csr(971, 0); - } - - // loop bounds, from innermost to outermost, for data mover C - write_csr(972, Ctlbound0); - write_csr(973, Ctlbound1); - write_csr(974, Ctlbound2); - - // for D32, from N to M - if (bypassSIMD == 0) { - write_csr(975, 0); - write_csr(976, 0); - write_csr(977, 0); - } else { - write_csr(975, D32tlbound0); - write_csr(976, D32tlbound1); - write_csr(977, D32tlbound2); - } - - // temporal strides for A - write_csr(978, Atlstride0); - write_csr(979, Atlstride1); - write_csr(980, Atlstride2); - write_csr(981, Atlstride3); - write_csr(982, Atlstride4); - write_csr(983, Atlstride5); - - // temporal strides for B - write_csr(984, Btlstride0); - write_csr(985, Btlstride1); - write_csr(986, Btlstride2); - - // temporal strides for D8 - write_csr(987, D8tlstride0); - write_csr(988, D8tlstride1); - write_csr(989, D8tlstride2); - - // temporal strides for C - write_csr(990, Ctlstride0); - write_csr(991, Ctlstride1); - write_csr(992, Ctlstride2); - - // temporal strides for D32 - write_csr(993, D32tlstride0); - write_csr(994, D32tlstride1); - write_csr(995, D32tlstride2); - - // spatial strides for A - write_csr(996, Aslstride0); - write_csr(997, Aslstride1); - - // spatial strides for B - write_csr(998, Bslstride0); - write_csr(999, Bslstride1); - - // spatial strides for D8 - write_csr(1000, D8slstride0); - write_csr(1001, D8slstride1); - - // spatial strides for C - write_csr(1002, Cslstride0); - write_csr(1003, Cslstride1); - - // spatial strides for D32 - write_csr(1004, D32slstride0); - write_csr(1005, D32slstride1); - - // base ptr for A - write_csr(1006, (uint32_t)(delta_local_a + snrt_l1_next())); - - // base ptr for B - write_csr(1007, (uint32_t)(delta_local_b + snrt_l1_next())); - - // base ptr for D8 - write_csr(1008, (uint32_t)(delta_local_d8 + snrt_l1_next())); - - // base ptr for C - 
write_csr(1009, (uint32_t)(delta_local_c + snrt_l1_next())); - - // base ptr for D32 - write_csr(1010, (uint32_t)(delta_local_d32 + snrt_l1_next())); -} - -// Set CSR to start STREAMER -void set_gemmx_streamer_start() { write_csr(1011, 1); } - -// Set GEMM configuration CSR -void set_gemmx_csr(int tempLoop0, int tempLoop1, int tempLoop2, - int subtractions, uint32_t csr0, uint32_t csr1, - uint32_t csr2, uint32_t temporal_loop_bound, - uint32_t bypassSIMD) { - // set loop bounds, from innermost to outermost, aka from K to N to M - write_csr(1014, tempLoop0); - write_csr(1015, tempLoop1); - write_csr(1016, tempLoop2); - - // set subtraction a and b - write_csr(1017, subtractions); - - // set the constants for the SIMD unit - write_csr(1018, csr0); - write_csr(1019, csr1); - write_csr(1020, csr2); - - // set the temporal loop bound - write_csr(1021, temporal_loop_bound); - write_csr(1022, bypassSIMD); -} - -// Set CSR to start GEMM -void set_gemmx_start() { write_csr(1023, 1); } - -// Stall until Streamer and GEMM accelerator finish -void wait_gemmx_and_streamer() { - write_csr(1011, 0); - write_csr(1011, 0); - write_csr(1023, 0); -} - -// Read performance counter of the Streamer, a read-only CSR -uint32_t read_gemmx_streamer_perf_counter() { - uint32_t perf_counter = read_csr(1013); - return perf_counter; -} - -// Read performance counter of GEMM, a read-only CSR -uint32_t read_gemmx_perf_counter() { - uint32_t perf_counter = read_csr(1025); - return perf_counter; -} - -uint32_t check_gemmx_result_D8(int8_t* output, int8_t* output_golden, - int32_t Batch, int32_t M, int32_t N) { - uint32_t err = 0; - uint32_t size = 0; - size = Batch * M * N * 8 * 8; - - for (int i = 0; i < size; i++) { - if (output[i] != output_golden[i]) { - err++; - } - } - return err; -} - -uint32_t check_gemmx_result_D32(int32_t* output, int32_t* output_golden, - int32_t Batch, int32_t M, int32_t N) { - uint32_t err = 0; - uint32_t size = 0; - size = Batch * M * N * 8 * 8; - - for (int i = 0; i 
< size; i++) { - if (output[i] != output_golden[i]) { - err++; - } - } - return err; -} diff --git a/target/sim/sw/device/snax/streamer-gemm-conv/Makefile b/target/sim/sw/device/snax/streamer-gemm-conv/Makefile deleted file mode 100755 index c046d6465..000000000 --- a/target/sim/sw/device/snax/streamer-gemm-conv/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -include $(MK_DIR)/../common.mk - -############ -## Outputs # -############ - -OBJS = $(BUILDDIR)/snax-streamer-gemm-conv-lib.o -ALL_OUTPUTS = $(OBJS) -INCDIRS += $(abspath include) - -########## -## Rules # -########## - -.PHONY: all -all: $(ALL_OUTPUTS) - -.PHONY: clean -clean: - rm -rf $(BUILDDIR) diff --git a/target/sim/sw/device/snax/streamer-gemm-conv/include/snax-streamer-gemm-conv-lib.h b/target/sim/sw/device/snax/streamer-gemm-conv/include/snax-streamer-gemm-conv-lib.h deleted file mode 100755 index 819f1d4d8..000000000 --- a/target/sim/sw/device/snax/streamer-gemm-conv/include/snax-streamer-gemm-conv-lib.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2024 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include -#include "snrt.h" -#include "stdint.h" - -#pragma once - -// load input data from L3 to L1 -void load_conv_input_data(int N, int H, int W, int C, int8_t* base_ptr_local, - int8_t* base_ptr_l2); - -void load_weight_data(int K, int Fy, int Fx, int C, int8_t* base_ptr_local, - int8_t* base_ptr_l2); - -// Set STREAMER configuration CSR -void set_conv_streamer_csr( - int Aslstride0, int Aslstride1, int Atlbound0, int Atlstride0, - int Atlbound1, int Atlstride1, int Atlbound2, int Atlstride2, int Atlbound3, - int Atlstride3, int Atlbound4, int Atlstride4, int Atlbound5, - int Atlstride5, int Bslstride0, int Bslstride1, int Btlbound0, - int Btlstride0, int Btlbound1, int Btlstride1, int Btlbound2, - int Btlstride2, int Cslstride0, int Cslstride1, int Ctlbound0, - int Ctlstride0, int Ctlbound1, int Ctlstride1, int Ctlbound2, - int Ctlstride2, int delta_local_a, int delta_local_b, int delta_local_c); - -// Set CSR to start STREAMER -void set_conv_streamer_start(); - -// Set GEMM configuration CSR -void set_conv_block_gemm_csr(int tempLoop0, int tempLoop1, int tempLoop2, - int subtractions); - -// Set CSR to start GEMM -void set_conv_block_gemm_start(); - -// Poll until Streamer and GEMM accelerator finish -void wait_conv_streamer_gemm(); - -// Read performance counter of the Streamer, a read-only CSR -uint32_t read_conv_gemm_streamer_perf_counter(); - -// Read performance counter of GEMM, a read-only CSR -uint32_t read_conv_gemm_perf_counter(); - -// Check the result of the implicit im2col convolution -uint32_t check_conv_result(int32_t* output, int32_t* output_golden, - int32_t Batch, int32_t M, int32_t N); diff --git a/target/sim/sw/device/snax/streamer-gemm-conv/src/snax-streamer-gemm-conv-lib.c b/target/sim/sw/device/snax/streamer-gemm-conv/src/snax-streamer-gemm-conv-lib.c deleted file mode 100755 index f406d5cce..000000000 --- 
a/target/sim/sw/device/snax/streamer-gemm-conv/src/snax-streamer-gemm-conv-lib.c +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2024 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include -#include "snrt.h" -#include "stdint.h" - -// load input data from L3 to L1 -void load_conv_input_data(int N, int H, int W, int C, int8_t* base_ptr_local, - int8_t* base_ptr_l2) { - snrt_dma_start_1d(base_ptr_local, base_ptr_l2, - N * H * W * C * sizeof(int8_t)); -} - -void load_weight_data(int K, int Fy, int Fx, int C, int8_t* base_ptr_local, - int8_t* base_ptr_l2) { - snrt_dma_start_1d(base_ptr_local, base_ptr_l2, - K * Fy * Fx * C * sizeof(int8_t)); -} - -// Set STREAMER configuration CSR -void set_conv_streamer_csr( - int Aslstride0, int Aslstride1, int Atlbound0, int Atlstride0, - int Atlbound1, int Atlstride1, int Atlbound2, int Atlstride2, int Atlbound3, - int Atlstride3, int Atlbound4, int Atlstride4, int Atlbound5, - int Atlstride5, int Bslstride0, int Bslstride1, int Btlbound0, - int Btlstride0, int Btlbound1, int Btlstride1, int Btlbound2, - int Btlstride2, int Cslstride0, int Cslstride1, int Ctlbound0, - int Ctlstride0, int Ctlbound1, int Ctlstride1, int Ctlbound2, - int Ctlstride2, int delta_local_a, int delta_local_b, int delta_local_c) { - // loop bounds, from innermost to outermost, for data mover A - write_csr(960, Atlbound0); - write_csr(961, Atlbound1); - write_csr(962, Atlbound2); - write_csr(963, Atlbound3); - write_csr(964, Atlbound4); - write_csr(965, Atlbound5); - - // loop bounds, from innermost to outermost, for data mover B - write_csr(966, Btlbound0); - write_csr(967, Btlbound1); - write_csr(968, Btlbound2); - - // loop bounds, from innermost to outermost, for data mover C - write_csr(969, Ctlbound0); - write_csr(970, Ctlbound1); - write_csr(971, Ctlbound2); - - // temporal strides for A - write_csr(972, Atlstride0); - write_csr(973, Atlstride1); - 
write_csr(974, Atlstride2); - write_csr(975, Atlstride3); - write_csr(976, Atlstride4); - write_csr(977, Atlstride5); - - // temporal strides for B - write_csr(978, Btlstride0); - write_csr(979, Btlstride1); - write_csr(980, Btlstride2); - - // temporal strides for C - write_csr(981, Ctlstride0); - write_csr(982, Ctlstride1); - write_csr(983, Ctlstride2); - - // spatial strides for A - write_csr(984, Aslstride0); - write_csr(985, Aslstride1); - - // spatial strides for B - write_csr(986, Bslstride0); - write_csr(987, Bslstride1); - - // spatial strides for C - write_csr(988, Cslstride0); - write_csr(989, Cslstride1); - - // base ptr for A - write_csr(990, (uint32_t)(delta_local_a + snrt_l1_next())); - - // base ptr for B - write_csr(991, (uint32_t)(delta_local_b + snrt_l1_next())); - - // base ptr for C - write_csr(992, (uint32_t)(delta_local_c + snrt_l1_next())); -} - -// Set CSR to start STREAMER -void set_conv_streamer_start() { write_csr(993, 1); } - -// Set GEMM configuration CSR -void set_conv_block_gemm_csr(int tempLoop0, int tempLoop1, int tempLoop2, - int subtractions) { - // set loop bounds, from innermost to outermost, aka from K to N to M - write_csr(995, tempLoop0); - write_csr(996, tempLoop1); - write_csr(997, tempLoop2); - - // set subtraction a and b - write_csr(998, subtractions); -} - -// Set CSR to start GEMM -void set_conv_block_gemm_start() { write_csr(999, 1); } - -// Stall until Streamer and GEMM accelerator finish -void wait_conv_streamer_gemm() { - write_csr(999, 0); - write_csr(999, 0); - write_csr(993, 0); -} - -// Read performance counter of the Streamer, a read-only CSR -uint32_t read_conv_gemm_streamer_perf_counter() { - uint32_t perf_counter = read_csr(994); - return perf_counter; -} - -// Read performance counter of GEMM, a read-only CSR -uint32_t read_conv_gemm_perf_counter() { - uint32_t perf_counter = read_csr(1000); - return perf_counter; -} - -// Check the result of the implicit im2col convolution -uint32_t 
check_conv_result(int32_t* output, int32_t* output_golden, - int32_t Batch, int32_t M, int32_t N) { - uint32_t err = 0; - for (int i = 0; i < Batch * M * N * 8 * 8; i++) { - if (output[i] != output_golden[i]) { - err++; - } - } - return err; -} diff --git a/target/sim/sw/device/snax/streamer-gemm/Makefile b/target/sim/sw/device/snax/streamer-gemm/Makefile deleted file mode 100644 index 085470dcf..000000000 --- a/target/sim/sw/device/snax/streamer-gemm/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2023 KU Leuven. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Xiaoling Yi - -# Usage of absolute paths is required to externally include -# this Makefile from multiple different locations - -MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -include $(MK_DIR)/../common.mk - -############ -## Outputs # -############ - -OBJS = $(BUILDDIR)/snax-streamer-gemm-lib.o -ALL_OUTPUTS = $(OBJS) -INCDIRS += $(abspath include) - -########## -## Rules # -########## - -.PHONY: all -all: $(ALL_OUTPUTS) - -.PHONY: clean -clean: - rm -rf $(BUILDDIR) - - diff --git a/target/sim/sw/device/snax/streamer-gemm/include/snax-streamer-gemm-lib.h b/target/sim/sw/device/snax/streamer-gemm/include/snax-streamer-gemm-lib.h deleted file mode 100644 index eb94222fc..000000000 --- a/target/sim/sw/device/snax/streamer-gemm/include/snax-streamer-gemm-lib.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2023 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include -#include "snrt.h" -#include "stdint.h" - -#pragma once - -// Set STREAMER configuration CSR -void set_streamer_csr(int tempLoop0, int tempLoop1, int tempLoop2, - int tempStride0A, int tempStride2A, int sptialStride1A, int tempStride0B, - int tempStride1B,int sptialStride1B, int tempStride1C, int tempStride2C, int sptialStride1C, - int delta_local_a, int delta_local_b, int delta_local_c); - -// Set CSR to start STREAMER -void set_streamer_start(); - -// Set GEMM configuration CSR -void set_block_gemm_csr(int tempLoop0, int tempLoop1, int tempLoop2, - int subtractions); - -// Set CSR to start GEMM -void set_block_gemm_start(); - -// Poll until Streamer and GEMM accelerator finish -void wait_streamer_gemm(); - -void start_gemm_then_wait_streamer_gemm(); - -uint32_t read_gemm_streamer_perf_counter(); - -uint32_t read_gemm_perf_counter(); diff --git a/target/sim/sw/device/snax/streamer-gemm/src/snax-streamer-gemm-lib.c b/target/sim/sw/device/snax/streamer-gemm/src/snax-streamer-gemm-lib.c deleted file mode 100644 index 6c88f7f6e..000000000 --- a/target/sim/sw/device/snax/streamer-gemm/src/snax-streamer-gemm-lib.c +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2023 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include "snax-streamer-gemm-lib.h" - -// Set STREAMER configuration CSR -void set_streamer_csr(int tempLoop0, int tempLoop1, int tempLoop2, - int tempStride0A, int tempStride2A, int sptialStride1A, int tempStride0B, - int tempStride1B,int sptialStride1B, int tempStride1C, int tempStride2C, int sptialStride1C, - int delta_local_a, int delta_local_b, int delta_local_c) { - // loop bounds, from innermost to outermost, from K to N to M - write_csr(960, tempLoop0); - write_csr(961, tempLoop1); - write_csr(962, tempLoop2); - - // temporal strides for A - write_csr(963, tempStride0A); - write_csr(964, 0); - write_csr(965, tempStride2A); - - // temporal strides for B - write_csr(966, tempStride0B); - write_csr(967, tempStride1B); - write_csr(968, 0); - - // temporal strides for C - write_csr(969, 0); - write_csr(970, tempStride1C); - write_csr(971, tempStride2C); - - // spatial strides for A - write_csr(972, 1); - write_csr(973, sptialStride1A); - - // spatial strides for B - write_csr(974, 1); - write_csr(975, sptialStride1B); - - // spatial strides for C - write_csr(976, 4); - write_csr(977, sptialStride1C); - - // base ptr for A - write_csr(978, (uint32_t)(delta_local_a + snrt_l1_next())); - - // base ptr for B - write_csr(979, (uint32_t)(delta_local_b + snrt_l1_next())); - - // base ptr for C - write_csr(980, (uint32_t)(delta_local_c + snrt_l1_next())); -} - -// Set CSR to start STREAMER -void set_streamer_start() { write_csr(982, 1); } - -// Set GEMM configuration CSR -void set_block_gemm_csr(int tempLoop0, int tempLoop1, int tempLoop2, - int subtractions) { - // set loop bounds, from innermost to outermost, aka from K to N to M - write_csr(983, tempLoop0); - write_csr(984, tempLoop1); - write_csr(985, tempLoop2); - - // set subtraction a and b - write_csr(986, subtractions); -} - -// Set CSR to start GEMM -void set_block_gemm_start() { write_csr(988, 1); } - -// Poll until Streamer and GEMM accelerator 
finish -void wait_streamer_gemm() { - write_csr(988, 0); - write_csr(988, 0); - write_csr(982, 0); -} - -void start_gemm_then_wait_streamer_gemm(){ - snrt_mcycle(); - write_csr(988, 1); - write_csr(988, 0); - write_csr(988, 0); - write_csr(982, 0); - snrt_mcycle(); - - // write_csr(981, 0); - -} - -uint32_t read_gemm_streamer_perf_counter(){ - uint32_t perf_counter = read_csr(981); - return perf_counter; -} - -uint32_t read_gemm_perf_counter(){ - uint32_t perf_counter = read_csr(987); - return perf_counter; -} diff --git a/target/sim/sw/device/snax/streamer-simd/include/snax-streamer-simd-lib.h b/target/sim/sw/device/snax/streamer-simd/include/snax-streamer-simd-lib.h deleted file mode 100644 index 801370c46..000000000 --- a/target/sim/sw/device/snax/streamer-simd/include/snax-streamer-simd-lib.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2023 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include -#include "snrt.h" -#include "stdint.h" - -#pragma once - -// the spatial unrolling of simd -# define vec_len 64 - -// set the address of the first CSR -// uint32_t csr_offset = 1024; - -// generate the configuration for CSR0 -int32_t gen_csr0_config(uint8_t input_zp_i, uint8_t output_zp_i, - uint8_t shift_i, uint8_t max_int_i); - -// generate the configuration for CSR1 -int32_t gen_csr1_config(uint8_t min_int_i, bool double_round_i); - -// generate the configuration for CSR2 -int32_t gen_csr2_config(uint32_t multiplier_i); - -// set the configuration for the streamer -void set_streamer_simd_csr(int tempLoop0, int tempLoop1, int tempStride0_in, - int tempStride1_in, int tempStride0_out, - int tempStride1_out, int32_t delta_local_in, int32_t delta_local_out); - -// start the streamer -void start_streamer_simd(); - -// set the configuration for the SIMD -void set_simd_csr(uint32_t csr0, uint32_t csr1, uint32_t csr2, uint32_t temporal_loop_bound); - -// start the SIMD 
-void start_simd(); - -// wait for the streamer to finish -void wait_streamer_simd(); - -void start_simd_then_wait_streamer_simd(); - -uint32_t read_simd_streamer_perf_counter(); - -uint32_t read_simd_perf_counter(); - -// load the test data into TCDM -void load_simd_test_data(int tempLoop0, int tempLoop1, int tempStride0, - int tempStride1, int32_t* base_ptr_local, - int32_t* base_ptr_l2); - -// c specification of the post processing -int8_t scale_quant_clamp_c_spec(int32_t input, int8_t input_zp, int8_t output_zp, - int32_t multiplier, - int8_t shift, // values between 0-63 - int8_t max_int, int8_t min_int, bool double_round); - -// check the result of the SIMD -uint32_t check_simd_result(int tempLoop0, int tempLoop1, int tempStride0, - int tempStride1, int8_t* base_ptr_local, - int8_t* base_ptr_l2); diff --git a/target/sim/sw/device/snax/streamer-simd/src/snax-streamer-simd-lib.c b/target/sim/sw/device/snax/streamer-simd/src/snax-streamer-simd-lib.c deleted file mode 100644 index c0ad4df70..000000000 --- a/target/sim/sw/device/snax/streamer-simd/src/snax-streamer-simd-lib.c +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright 2023 KU Leuven. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Xiaoling Yi - -#include "snax-streamer-simd-lib.h" - -int32_t gen_csr0_config(uint8_t input_zp_i, uint8_t output_zp_i, - uint8_t shift_i, uint8_t max_int_i) -{ - // encode the configuration into a single 32-bit integer - return ((int32_t)max_int_i << 24) | ((int32_t)shift_i << 16) | - ((int32_t)output_zp_i << 8) | (int32_t)input_zp_i; -} - -int32_t gen_csr1_config(uint8_t min_int_i, bool double_round_i) { - // encode the configuration into a single 32-bit integer - return ((uint8_t)double_round_i << 8) | (uint8_t)min_int_i; -} - -int32_t gen_csr2_config(uint32_t multiplier_i) { return multiplier_i; } - -void set_streamer_simd_csr(int tempLoop0, int tempLoop1, int tempStride0_in, - int tempStride1_in, int tempStride0_out, - int tempStride1_out, int32_t delta_local_in, int32_t delta_local_out) { - - // temporal loop bounds, from innermost to outermost - write_csr(960+0, tempLoop0); - write_csr(960+1, tempLoop1); - - // temporal strides for data reader (In) - write_csr(960+2, tempStride0_in); - write_csr(960+3, tempStride1_in); - - // temporal strides for data writer (Out) - write_csr(960+4, tempStride0_out); - write_csr(960+5, tempStride1_out); - - // fixed spatial strides for data reader (In) - write_csr(960+6, 4); - write_csr(960+7, 32); - - // fixed spatial strides for data writer (Out) - write_csr(960+8, 1); - write_csr(960+9, 8); - - // base ptr for data reader (In) - write_csr(960+10, (uint32_t)(delta_local_in + snrt_l1_next())); - - // base ptr for data writer (Out) - write_csr(960+11, (uint32_t)(delta_local_out + snrt_l1_next())); -} - -void start_streamer_simd() { write_csr(960+13, 1); } - -void set_simd_csr(uint32_t csr0, uint32_t csr1, uint32_t csr2, uint32_t temporal_loop_bound) { - // set the constants for the SIMD unit - write_csr(960+14, csr0); - write_csr(960+15, csr1); - write_csr(960+16, csr2); - - // set the temporal loop bound - write_csr(960+17, temporal_loop_bound); -} - -void start_simd() { 
write_csr(960+19, 1); } - -void wait_streamer_simd() { - write_csr(960+19, 0); - write_csr(960+19, 0); - write_csr(960+13, 0); -} - -void start_simd_then_wait_streamer_simd(){ - write_csr(960+19, 1); - write_csr(960+19, 0); - write_csr(960+19, 0); - write_csr(960+13, 0); -} - -uint32_t read_simd_streamer_perf_counter(){ - uint32_t perf_counter = read_csr(960 + 12); - return perf_counter; -} - -uint32_t read_simd_perf_counter(){ - uint32_t perf_counter = read_csr(960+18); - return perf_counter; -} - -void load_simd_test_data(int tempLoop0, int tempLoop1, int tempStride0, - int tempStride1, int32_t* base_ptr_local, - int32_t* base_ptr_l2) { - int32_t* addr_in; - int32_t* addr_In; - - for (int loop1 = 0; loop1 < tempLoop1; loop1++) { - for (int loop0 = 0; loop0 < tempLoop0; loop0++) { - addr_in = - base_ptr_local + (loop1 * tempStride1 + loop0 * tempStride0) / sizeof(int32_t); - addr_In = - base_ptr_l2 + loop1 * tempLoop0 * vec_len + loop0 * vec_len; - snrt_dma_start_1d(addr_in, addr_In, vec_len * sizeof(int32_t)); - } - } -} - -int8_t scale_quant_clamp_c_spec(int32_t input, int8_t input_zp, int8_t output_zp, - int32_t multiplier, - int8_t shift, // values between 0-63 - int8_t max_int, int8_t min_int, bool double_round) { - - // input zero-point adjustment - input = input - input_zp; - - // multiplication - int64_t var0 = (int64_t)input * (int64_t)multiplier; - - // shift & round - int32_t var1 = var0 >> (shift - 1); - - if (double_round) { - if (var1 >= 0) - var1 += 1; - else - var1 -= 1; - } - var1 = var1 >> 1; - - // output zero-point adjustment - var1 = var1 + output_zp; - - // clamping - if (var1 > max_int) - var1 = max_int; - if (var1 < min_int) - var1 = min_int; - - int8_t result = (int8_t)var1; - return result; -} - -uint32_t check_simd_result(int tempLoop0, int tempLoop1, int tempStride0, - int tempStride1, int8_t* base_ptr_local, - int8_t* base_ptr_l2) { - int8_t* addr_out; - int8_t* addr_Out; - uint32_t error = 0; - - for (int loop1 = 0; loop1 < tempLoop1; 
loop1++) { - for (int loop0 = 0; loop0 < tempLoop0; loop0++) { - for (int i = 0; i < vec_len; i++) { - addr_out = - base_ptr_local + (loop1 * tempStride1 + loop0 * tempStride0) + i; - addr_Out = - base_ptr_l2 + loop1 * tempLoop0 * vec_len + loop0 * vec_len + i; - if ((int8_t)*addr_out != (int8_t)*addr_Out) { - error++; - } - } - } - } - return error; -} diff --git a/target/sim/sw/host/apps/offload/Makefile b/target/sim/sw/host/apps/offload/Makefile index 3df56214e..0e1aa2549 100644 --- a/target/sim/sw/host/apps/offload/Makefile +++ b/target/sim/sw/host/apps/offload/Makefile @@ -32,9 +32,7 @@ RUNTIME_DIR = $(abspath $(HOST_DIR)/runtime) DEVICE_DIR = $(abspath $(HOST_DIR)/../device) # now we only include the snax app -DEVICE_APPS += snax/snax-data-reshuffler -DEVICE_APPS += snax/snax-streamer-gemm-conv -DEVICE_APPS += snax/snax-streamer-gemm-conv-simd +DEVICE_APPS += snax/snax-gemmx DEVICE_APPS += snax/snax-test-integration DEVICE_APPS += snax/snax-hypercorex-test-csr DEVICE_APPS += snax/snax-hypercorex-char-recog diff --git a/target/sim/sw/sim_elf.yaml b/target/sim/sw/sim_elf.yaml index bdc2c3fc0..a9d00d5f7 100644 --- a/target/sim/sw/sim_elf.yaml +++ b/target/sim/sw/sim_elf.yaml @@ -4,3 +4,4 @@ runs: - elf: host/apps/offload/build/offload-snax-test-integration.elf + - elf: host/apps/offload/build/offload-snax-gemmx.elf diff --git a/util/sim/snax_utils.py b/util/sim/snax_utils.py deleted file mode 100644 index c61779234..000000000 --- a/util/sim/snax_utils.py +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2024 KU Leuven. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Xiaoling Yi - -import numpy as np - - -# Function to perform 2D convolution on the input data using the specified kernel, -# stride, and padding. It returns the output feature map. 
-def conv2d(input_data, kernel, stride=(1, 1), padding=(0, 0), mode="NHWC"): - if mode == "NHWC": - batch_size, in_height, in_width, in_channels = input_data.shape - out_channels, kernel_height, kernel_width, _ = kernel.shape - stride_h, stride_w = stride - pad_h, pad_w = padding - - # Calculate the output feature map dimensions - out_height = (in_height - kernel_height + 2 * pad_h) // stride_h + 1 - out_width = (in_width - kernel_width + 2 * pad_w) // stride_w + 1 - - # Add padding - input_data_padded = np.pad( - input_data, - ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), - mode="constant", - ) - - # Initialize the output feature map - output_data = np.zeros( - (batch_size, out_height, out_width, out_channels), np.int32 - ) - - # Perform the convolution operation - for b in range(batch_size): - for oc in range(out_channels): - for oh in range(out_height): - for ow in range(out_width): - # Calculate the input region - ih_start = oh * stride_h - ih_end = ih_start + kernel_height - iw_start = ow * stride_w - iw_end = iw_start + kernel_width - - # Slice to extract the input region - input_region = input_data_padded[ - b, ih_start:ih_end, iw_start:iw_end, : - ] - - # Slice to extract the corresponding convolution kernel - conv_kernel = kernel[oc, :, :, :] - - # Perform the convolution calculation - output_data[b, oh, ow, oc] = np.sum(input_region * conv_kernel) - else: - batch_size, Cin8, in_height, in_width, t = input_data.shape - assert t == 8 - Cout8, Cin8, kernel_height, kernel_width, t1, t2 = kernel.shape - assert t1 == 8 - assert t2 == 8 - stride_h, stride_w = stride - pad_h, pad_w = padding - - # Calculate the output feature map dimensions - out_height = (in_height - kernel_height + 2 * pad_h) // stride_h + 1 - out_width = (in_width - kernel_width + 2 * pad_w) // stride_w + 1 - - # Add padding - input_data_padded = np.pad( - input_data, - ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), - mode="constant", - ) - - # Initialize the output feature map 
- output_data = np.zeros( - (batch_size, Cout8, out_height, out_width // 8, 8, 8), np.int32 - ) - - # Perform the convolution operation - for b in range(batch_size): - for oc in range(Cout8): - for oc8 in range(8): - for oh in range(out_height): - for ow in range(out_width // 8): - for ow8 in range(8): - # Calculate the input region - iw_start = (ow * 8 + ow8) * stride_w - iw_end = iw_start + kernel_width - - ih_start = oh * stride_h - ih_end = ih_start + kernel_height - - # Slice to extract the input region - input_region = input_data_padded[ - b, :, ih_start:ih_end, iw_start:iw_end, : - ] - - # Slice to extract the corresponding convolution kernel - conv_kernel = kernel[oc, :, :, :, oc8, :] - - # Perform the convolution calculation - output_data[b, oc, oh, ow, ow8, oc8] = np.sum( - input_region * conv_kernel - ) - - return output_data - - -# Function to transform input data into columns for efficient convolution operations. -# It returns the transformed input data and reshaped kernel. 
-def im2col(input_data, kernel, stride=(1, 1), padding=(0, 0)): - batch_size, in_height, in_width, in_channels = input_data.shape - out_channels, kernel_height, kernel_width, _ = kernel.shape - stride_h, stride_w = stride - pad_h, pad_w = padding - - # Calculate the size of the output feature map - out_height = (in_height + 2 * pad_h - kernel_height) // stride_h + 1 - out_width = (in_width + 2 * pad_w - kernel_width) // stride_w + 1 - - # Apply zero padding to the input data - input_data_padded = np.pad( - input_data, ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), mode="constant" - ) - - # Initialize the im2col matrix - im2col_matrix = np.zeros( - (batch_size, out_height * out_width, in_channels * kernel_height * kernel_width) - ) - - # Perform the im2col transformation on the input data - for b in range(batch_size): - for oh in range(out_height): - for ow in range(out_width): - # Calculate the input region - ih_start = oh * stride_h - ih_end = ih_start + kernel_height - iw_start = ow * stride_w - iw_end = iw_start + kernel_width - - # Slice and extract the input region - input_region = input_data_padded[b, ih_start:ih_end, iw_start:iw_end, :] - - # Flatten the input region into a 1D vector and add it to the - # corresponding position in the im2col matrix - im2col_matrix[b, oh * out_width + ow, :] = input_region.reshape(-1) - - im2col_matrix = im2col_matrix.reshape(batch_size * out_height * out_width, -1) - im2col_kernel = kernel.reshape(out_channels, -1).T - - return im2col_matrix, im2col_kernel - - -# Golden model function to perform block matrix multiplication with specific parameters. -# It returns the resulting matrix after the computation. 
-def block_gemm_golden_model( - m, k, n, row, size, col, a, b, subtraction_a, subtraction_b, c -): - d = np.zeros(m * row * n * col, dtype=(np.int32)) - for mm in range(m): - for nn in range(n): - for kk in range(k): - for rr in range(row): - for cc in range(col): - for ss in range(size): - c_index = ( - mm * n * row * col + nn * row * col + rr * col + cc - ) - a_index = ( - mm * k * row * size + kk * row * size + rr * size + ss - ) - b_index = ( - nn * k * size * col + kk * size * col + cc * size + ss - ) - d[c_index] = d[c_index] + (a[a_index] - subtraction_a) * ( - b[b_index] - subtraction_b - ) - d = np.add(c, d) - return d - - -# This function Performs a tiled block General Matrix Multiply (GEMM) operation. -# -# This function breaks down large matrix multiplication into smaller submatrices -# (tiles) and performs GEMM on these submatrices. The results are then accumulated -# into a final result matrix. -# -# Parameters: -# m2, k2, n2: int -# The number of tiles in each dimension. -# m, k, n: int -# The dimensions of the submatrices for block matrix multiplication. -# row, size, col: int -# Size parameters for the submatrices in the hardware gemm accelerator. -# a, b, c: numpy.ndarray -# The input matrices. -# subtraction_a, subtraction_b: bool -# Flags indicating whether to perform subtraction in the GEMM computation. -# -# Returns: -# numpy.ndarray -# The result of the tiled GEMM operation as a flattened array. 
-def tiled_block_gemm_golden_model( - m2, k2, n2, m, k, n, row, size, col, a, b, subtraction_a, subtraction_b, c -): - # Create an empty array for the result with the appropriate size - result = np.zeros((m2 * m * row * n2 * n * col), dtype=np.int32) - - # Loop over the tiles - for mm2 in range(m2): - for nn2 in range(n2): - for kk2 in range(k2): - # Create submatrices for this tile - sub_a = a[ - (mm2 * k2 + kk2) - * m - * k - * row - * size: (mm2 * k2 + kk2 + 1) - * m - * k - * row - * size - ] - sub_b = b[ - (nn2 * k2 + kk2) - * n - * k - * size - * col: (nn2 * k2 + kk2 + 1) - * n - * k - * size - * col - ] - sub_c = c[ - (mm2 * n2 + nn2) - * m - * row - * n - * col: (mm2 * n2 + nn2 + 1) - * m - * row - * n - * col - ] - - # Perform block GEMM on the submatrices - sub_d = block_gemm_golden_model( - m, - k, - n, - row, - size, - col, - sub_a, - sub_b, - subtraction_a, - subtraction_b, - sub_c, - ) - # Accumulate the result into the final result matrix at the correct position - result[ - (mm2 * n2 + nn2) - * m - * row - * n - * col: (mm2 * n2 + nn2 + 1) - * m - * row - * n - * col - ] += sub_d - - return result - - -# Golden model function for reshuffling data with specified parameters. It applies -# strided layout mapping to the input data and returns the reshuffled data array. 
-def data_reshuffler_golden_model( - tempLoop0, - tempLoop1, - spatial_len_0, - spatial_len_1, - tempStride0, - tempStride1, - spatialStride0, - spatialStride1, - data, - int32=False, -): - # abstract illusion: k innermost loop, m second innermost loop, - # K third innermost loop, M outermost loop - - # total loop bounds = spatial loop bounds * temporal loop bounds - K = tempLoop0 * spatial_len_0 - M = tempLoop1 * spatial_len_1 - - # loop bounds settings - matrix_size = {"K": K, "M": M, "k": spatial_len_0, "m": spatial_len_1} - - # stride settings - strides = { - "M": tempStride1, - "K": tempStride0, - "m": spatialStride1, - "k": spatialStride0, - } - - if int32: - result_array = np.zeros((matrix_size["M"] * matrix_size["K"]), np.int32) - else: - result_array = np.zeros((matrix_size["M"] * matrix_size["K"]), np.int8) - - # apply strided layout mapping for the golden model of data reshuffler - for M in range(matrix_size["M"] // matrix_size["m"]): - for K in range(matrix_size["K"] // matrix_size["k"]): - for m in range(matrix_size["m"]): - for k in range(matrix_size["k"]): - result_array[ - # output address calculation with coutinued increment - matrix_size["K"] - // matrix_size["k"] - * matrix_size["k"] - * matrix_size["m"] - * M - + matrix_size["k"] * matrix_size["m"] * K - + m * matrix_size["k"] - + k - ] = data[ - # input address calculation with - # strided layout mapping eqaution - strides["M"] * M - + strides["K"] * K - + strides["m"] * m - + strides["k"] * k - ] - - return result_array.ravel() - - -# Golden model function for SIMD postprocessing of data. It performs operations such as -# zero point subtraction, multiplication, right shift, double rounding, and clipping. 
-def postprocessing_simd_golden_model( - data_in, - input_zp_i, - output_zp_i, - shift_i, - max_int_i, - min_int_i, - double_round_i, - multiplier_i, -): - - # Step 1: Subtract input zero point - var = data_in - input_zp_i - - # Step 2: Multiply with the multiplier avoiding overflow - var = np.int64(var) * np.int64(multiplier_i) - - # Step 3: Right shift - var = np.int32(var >> (shift_i - 1)) - - # Step 4: Apply double rounding if necessary - if double_round_i: - var = np.where(var >= 0, var + 1, var - 1) - - # Step 5: Final right shift - var = var >> 1 - - # Step 6: Add output zero point - var = var + output_zp_i - - # Step 7: Clip the values to be within min and max integer range - var = np.clip(var, min_int_i, max_int_i) - - return var - - -def max_pooling( - input_tensor, - pool_size_w, - pool_size_h, - stride_w, - stride_h, - padding_w, - padding_h, - mode="HWC", -): - - # if mode == "HWC", C8 is 1, C = realCin - # if mode != "HWC", C8 is realCin/8, C = 8 - C8, H, W, C = input_tensor.shape - if mode != "HWC": - assert input_tensor.shape[3] == 8 and C == 8 - elif mode == "HWC": - assert input_tensor.shape[0] == 1 and C8 == 1 - - out_width = (W + 2 * padding_w - pool_size_w) // stride_w + 1 - out_height = (H + 2 * padding_h - pool_size_h) // stride_h + 1 - - input_padded = np.pad( - input_tensor, - ((0, 0), (padding_h, padding_h), (padding_w, padding_w), (0, 0)), - mode="constant", - constant_values=0, - ) - - pooled_tensor = np.zeros((C8, out_height, out_width, C), dtype=np.int8) - - for c in range(C8): - for i in range(out_height): - for j in range(out_width): - for k in range(C): - h_start = i * stride_h - h_end = h_start + pool_size_h - w_start = j * stride_w - w_end = w_start + pool_size_w - pooled_tensor[c, i, j, k] = np.max( - input_padded[c, h_start:h_end, w_start:w_end, k] - ) - - return pooled_tensor