diff --git a/target/fpga/sw/send_uart.sh b/target/fpga/sw/send_uart.sh index 902660cb6..177ef9212 100755 --- a/target/fpga/sw/send_uart.sh +++ b/target/fpga/sw/send_uart.sh @@ -11,10 +11,10 @@ else FILE=$1 fi - stty -F /dev/ttyUSB1 cs8 1000000 ignbrk -brkint -imaxbel -opost -onlcr -isig -icanon -iexten -echo -echoe -echok -echoctl -echoke noflsh -ixon crtscts + stty -F /dev/ttyUSB2 cs8 1000000 ignbrk -brkint -imaxbel -opost -onlcr -isig -icanon -iexten -echo -echoe -echok -echoctl -echoke noflsh -ixon crtscts - echo -n 2 > /dev/ttyUSB1 + echo -n 2 > /dev/ttyUSB2 - sx -k "$FILE" < /dev/ttyUSB1 > /dev/ttyUSB1 + sx -k "$FILE" < /dev/ttyUSB2 > /dev/ttyUSB2 fi diff --git a/target/sim/sw/device/apps/snax/snax-test-integration/Makefile b/target/sim/sw/device/apps/snax/snax-test-integration/Makefile index 32d34657a..283b63074 100644 --- a/target/sim/sw/device/apps/snax/snax-test-integration/Makefile +++ b/target/sim/sw/device/apps/snax/snax-test-integration/Makefile @@ -14,4 +14,4 @@ include ./data/Makefile include ../../common.mk -$(DEP): $(DATA_H) \ No newline at end of file +$(DEP): $(DATA_H) diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/Makefile b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/Makefile new file mode 100644 index 000000000..ffd89b962 --- /dev/null +++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/Makefile @@ -0,0 +1,20 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0
+#
+# Yunhao Deng
+
+APP = snax-xdma-maxpool
+
+INCDIRS += data
+INCDIRS += ../../../snax/xdma/include/
+
+# Pass the prebuilt xdma library object to the linker
+RISCV_LDFLAGS += ../../../snax/xdma/build/snax-xdma-lib.o
+
+SRCS = src/snax-xdma-maxpool.c
+
+include ./data/Makefile
+include ../../common.mk
+
+$(DEP): $(DATA_H)
diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/Makefile b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/Makefile
new file mode 100644
index 000000000..18006cbf7
--- /dev/null
+++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/Makefile
@@ -0,0 +1,24 @@
+# Copyright 2023 KU Leuven.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Xiaoling Yi
+
+# Usage of absolute paths is required to externally include this Makefile
+MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+DATA_DIR := $(realpath $(MK_DIR))
+
+DATA_CFG ?= $(DATA_DIR)/params.hjson
+
+DATA_H = $(DATA_DIR)/data.h
+
+# Regenerate data.h whenever the generator script or the parameter file changes
+$(DATA_H): $(DATA_DIR)/datagen.py $(DATA_CFG)
+	$< -c $(DATA_CFG) > $@
+
+.PHONY: clean-data clean
+
+clean-data:
+	rm -f $(DATA_H)
+
+clean: clean-data
diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py
new file mode 100755
index 000000000..fccf81ebd
--- /dev/null
+++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+
+# Copyright 2024 KU Leuven.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Fanchen Kong
+
+import numpy as np
+import argparse
+import pathlib
+import hjson
+import sys
+import os
+import subprocess
+
+# Add data utility path
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../../../../util/sim/"))
+from data_utils import format_scalar_definition, format_vector_definition  # noqa E402
+
+bender_command = subprocess.run(['bender', 'path', 'snitch_cluster'],
+                                capture_output=True, text=True, check=True)
+snax_utils_path = bender_command.stdout.strip()
+
+sys.path.append(snax_utils_path + "/util/sim/")
+
+from snax_utils import max_pooling  # noqa E402
+np.random.seed(42)
+
+
+# Build the text of data.h: the C includes followed by the data definitions
+def emit_header_file(**kwargs):
+    emit_str = "#include <stdint.h>\n\n"
+    emit_str += "#include <stdbool.h>\n\n"  # NOTE(review): confirm header name
+    emit_str += emit_data(**kwargs)
+    return emit_str
+
+
+def emit_data(**kwargs):
+    MIN = -128
+    MAX = 127
+
+    data_str = ""
+    data_str += format_scalar_definition("int8_t",
+                                         "H",
+                                         kwargs["H"]) + "\n"
+    data_str += format_scalar_definition("int8_t",
+                                         "W",
+                                         kwargs["W"]) + "\n"
+    data_str += format_scalar_definition("int8_t",
+                                         "Cin",
+                                         kwargs["Cin"]) + "\n"
+    data_str += format_scalar_definition("int8_t",
+                                         "Kh",
+                                         kwargs["Kh"]) + "\n"
+    data_str += format_scalar_definition("int8_t", "Kw", kwargs["Kw"]) + "\n"
+    data_str += format_scalar_definition("int8_t",
+                                         "pad_h", kwargs["pad_h"]) + "\n"
+    data_str += format_scalar_definition("int8_t",
+                                         "pad_w", kwargs["pad_w"]) + "\n"
+    data_str += format_scalar_definition("int8_t",
+                                         "stride_h", kwargs["stride_h"]) + "\n"
+    data_str += format_scalar_definition("int8_t",
+                                         "stride_w", kwargs["stride_w"]) + "\n"
+    padded_h = kwargs["H"] + 2 * kwargs["pad_h"]
+    padded_w = kwargs["W"] + 2 * kwargs["pad_w"]
+    out_h = (kwargs["H"] + 2 * kwargs["pad_h"] -
+             kwargs["Kh"]) // kwargs["stride_h"] + 1
+    out_w = (kwargs["W"] + 2 * kwargs["pad_w"] -
+             kwargs["Kw"]) // kwargs["stride_w"] + 1
+
+    data_str += format_scalar_definition("int8_t", "out_H", out_h) + "\n"
+    data_str += format_scalar_definition("int8_t", "out_W", out_w) + "\n"
+    data_str += format_scalar_definition("int8_t", "padded_H", padded_h) + "\n"
+    data_str += format_scalar_definition("int8_t", "padded_W", padded_w) + "\n"
+
+    # Random input data; randint excludes its upper bound, hence MAX + 1
+    data_in = np.random.randint(
+        MIN, MAX + 1, (1, kwargs["H"], kwargs["W"], kwargs["Cin"])
+    )
+    padded_data_in = np.pad(
+        data_in,
+        (
+            (0, 0),
+            (kwargs["pad_h"], kwargs["pad_h"]),
+            (kwargs["pad_w"], kwargs["pad_w"]),
+            (0, 0),
+        ),
+        "constant",
+    )
+    # Generating golden data
+    c_golden = max_pooling(
+        data_in,
+        kwargs["Kw"],
+        kwargs["Kh"],
+        kwargs["stride_w"],
+        kwargs["stride_h"],
+        kwargs["pad_w"],
+        kwargs["pad_h"],
+        "HWC",
+    )
+    data_str += format_vector_definition("int8_t",
+                                         "padded_data_in",
+                                         padded_data_in.reshape(-1)) + "\n"
+    data_str += format_vector_definition("int8_t",
+                                         "golden_data_out",
+                                         c_golden.reshape(-1)) + "\n"
+
+    return data_str
+
+
+def main():
+    # Parsing cmd args
+    parser = argparse.ArgumentParser(description="Generating data for kernels")
+    parser.add_argument(
+        "-c",
+        "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help="Select param config file kernel",
+    )
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+
+    # Emit header file
+    print(emit_header_file(**param))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson
new file mode 100644
index 000000000..59df2f731
--- /dev/null
+++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson
@@ -0,0 +1,16 @@
+// Copyright 2023 KU Leuven.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Fanchen Kong
+{
+    H: 32,
+    W: 32,
+    Cin: 8,
+    Kh: 3,
+    Kw: 3,
+    pad_h: 1,
+    pad_w: 1,
+    stride_h: 1,
+    stride_w: 1
+}
\ No newline at end of file
diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c
new file mode 100644
index 000000000..5ca5743e4
--- /dev/null
+++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c
@@ -0,0 +1,115 @@
+// Copyright 2024 KU Leuven.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Fanchen Kong
+
+#include "data.h"
+#include "snax-xdma-lib.h"
+#include "snrt.h"
+
+// Max-pooling test for the xdma Maxpool extension. The DM core copies the
+// padded input into TCDM, pools it with xdma and compares the result against
+// golden_data_out. Returns the number of detected errors (0 on success).
+int main() {
+    // Set err value for checking
+    int err = 0;
+    // Obtain the start address of the TCDM memory
+    uint32_t dma_load_input_start;
+    uint32_t dma_load_input_end;
+    uint32_t *tcdm_baseaddress = (uint32_t *)snrt_l1_next();
+    // Put the input at the starting of tcdm
+    uint8_t *tcdm_in = (uint8_t *)tcdm_baseaddress;
+    // Put the output at the middle of tcdm
+    uint8_t *tcdm_out = tcdm_in + 0x10000 * sizeof(uint8_t);
+
+    if (snrt_is_dm_core()) {
+        // The xdma core is the last compute core in the cluster
+        uint32_t sstride_src[1] = {8};
+        uint32_t sstride_dst[1] = {8};
+        uint32_t tstride_src[2] = {8, 512};
+        uint32_t tbound_src[2] = {3, 3};
+
+        // First we need to transfer the input data from L3->TCDM
+        // Here we use the 2d iDMA transfer
+        dma_load_input_start = snrt_mcycle();
+        snrt_dma_start_2d(
+            tcdm_in, padded_data_in, padded_W * Cin * sizeof(uint8_t),
+            512 * sizeof(uint8_t), padded_W * Cin * sizeof(uint8_t),
+            padded_H * sizeof(uint8_t));
+        snrt_dma_wait_all();
+        dma_load_input_end = snrt_mcycle();
+
+        // --------------------- Configure the Ext --------------------- //
+
+        // There are three extensions in xdma
+        // VerilogMemset, Maxpool, Transposer
+        // 0            , 1      , 2
+        // We want to only use Maxpool
+        // Hence we need to disable the 0 and 2
+        // and we set the maxpool csr to 9 since we need 3x3 pooling
+        if (xdma_disable_dst_ext(0) != 0) {
+            printf("Error in disabling xdma extension 0 \r\n");
+            err++;
+        } else {
+            printf("The xdma extension 0 is disabled \r\n");
+        }
+
+        uint32_t ext_param_maxpool_size[1] = {9};
+        if (xdma_enable_dst_ext(1, ext_param_maxpool_size) != 0) {
+            printf("Error in enabling xdma extension 1 \r\n");
+            err++;
+        } else {
+            printf("The xdma extension 1 is enabled \r\n");
+        }
+
+        if (xdma_disable_dst_ext(2) != 0) {
+            printf("Error in disabling xdma extension 2 \r\n");
+            err++;
+        } else {
+            printf("The xdma extension 2 is disabled \r\n");
+        }
+
+        // --------------------- Configure the AGU --------------------- //
+        uint8_t *local_src_pointer;
+        uint8_t *local_dst_pointer;
+        int task_id;
+        for (int i = 0; i < out_H; i++) {
+            for (int j = 0; j < out_W / 8; j++) {
+                local_src_pointer = tcdm_in + j * 64 + i * 512;
+                local_dst_pointer = tcdm_out + j * 64 + i * 256;
+                if (xdma_memcpy_nd(local_src_pointer, local_dst_pointer,
+                                   sstride_src, sstride_dst, 2, tstride_src,
+                                   tbound_src, 0, NULL, NULL, 0xFFFFFFFF,
+                                   0xFFFFFFFF, 0xFFFFFFFF) != 0) {
+                    printf("Error in xdma agu configuration \r\n");
+                    err++;
+                } else {
+                    printf("The xdma agu is configured \r\n");
+                }
+                task_id = xdma_start();
+                xdma_wait(task_id);
+                printf("i = %d, j = %d is done \r\n", i, j);
+            }
+        }
+
+        // --------------------- Checking the Results --------------------- //
+        printf("Checking the results \r\n");
+        for (int i = 0; i < out_H * out_W * Cin; i++) {
+            if ((int8_t)tcdm_out[i] != golden_data_out[i]) {
+                printf("The maxpool is incorrect! \r\n");
+                printf("tcdm_out[%d]=%d, golden_data_out[%d]=%d \r\n", i,
+                       (int8_t)tcdm_out[i], i, golden_data_out[i]);
+                // Count the mismatch so that the test reports a failure
+                err++;
+            }
+        }
+        if (err == 0) {
+            printf("Checking is done. All values are right \r\n");
+        } else {
+            printf("Checking is done. %d error(s) detected \r\n", err);
+        }
+    }
+
+    return err;
+}
diff --git a/target/sim/sw/device/apps/snax/snax-xdma-memset/Makefile b/target/sim/sw/device/apps/snax/snax-xdma-memset/Makefile
new file mode 100644
index 000000000..e26811acf
--- /dev/null
+++ b/target/sim/sw/device/apps/snax/snax-xdma-memset/Makefile
@@ -0,0 +1,16 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Yunhao Deng
+
+APP = snax-xdma-memset
+
+INCDIRS += ../../../snax/xdma/include/
+
+# Add binary to final build
+RISCV_LDFLAGS += ../../../snax/xdma/build/snax-xdma-lib.o
+
+SRCS = src/snax-xdma-memset.c
+
+include ../../common.mk
diff --git a/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c b/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c
new file mode 100644
index 000000000..83d246edd
--- /dev/null
+++ b/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c
@@ -0,0 +1,200 @@
+// Copyright 2024 KU Leuven.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Yunhao Deng
+
+#include <stdbool.h>  // NOTE(review): header name reconstructed - confirm
+#include "snax-xdma-lib.h"
+#include "snrt.h"
+
+int main() {
+    // Set err value for checking
+    int err = 0;
+
+    // Obtain the start address of the TCDM memory
+    uint8_t *tcdm_baseaddress = (uint8_t *)snrt_l1_next();
+    uint8_t *tcdm_0 = tcdm_baseaddress;
+    uint8_t *tcdm_16 = tcdm_baseaddress + 0x4000 * sizeof(uint8_t);
+    uint8_t *tcdm_32 = tcdm_baseaddress + 0x8000 * sizeof(uint8_t);
+    uint8_t *tcdm_48 = tcdm_baseaddress + 0xc000 * sizeof(uint8_t);
+    uint8_t *tcdm_64 = tcdm_baseaddress + 0x10000 * sizeof(uint8_t);
+    uint8_t *tcdm_80 = tcdm_baseaddress + 0x14000 * sizeof(uint8_t);
+    uint8_t *tcdm_96 = tcdm_baseaddress + 0x18000 * sizeof(uint8_t);
+    uint8_t *tcdm_112 = tcdm_baseaddress + 0x1c000 * sizeof(uint8_t);
+
+    // Using xdma core only
+    if (snrt_is_dm_core()) {
+        // The xdma core is the last compute core in the cluster
+
+        // Test 1: Setting the 0-16KB region to 0xFF
+        printf("Core %d is xdma core. \r\n", snrt_cluster_core_idx());
+        printf("Test 1: Setting the 0-16KB region to 0xFF \r\n");
+        if (xdma_memcpy_1d(tcdm_0, tcdm_0, 0x4000 * sizeof(uint8_t)) != 0) {
+            printf("Error in xdma agu configuration \r\n");
+            err++;
+        } else {
+            printf("The xdma agu is configured \r\n");
+        }
+
+        uint32_t ext_param_t1[1] = {0xFFFFFFFF};
+        if (xdma_enable_dst_ext(0, ext_param_t1) != 0) {
+            printf("Error in enabling xdma extension 0 \r\n");
+            err++;
+        } else {
+            printf("The xdma extension 0 is enabled \r\n");
+        }
+
+        if (xdma_disable_dst_ext(1) != 0) {
+            printf("Error in disabling xdma extension 1 \r\n");
+            err++;
+        } else {
+            printf("The xdma extension 1 is disabled \r\n");
+        }
+
+        if (xdma_disable_dst_ext(2) != 0) {
+            printf("Error in disabling xdma extension 2 \r\n");
+            err++;
+        } else {
+            printf("The xdma extension 2 is disabled \r\n");
+        }
+
+        if (err != 0) {
+            return err;
+        }
+
+        int task_id = xdma_start();
+        printf(
+            "The xdma is started, setting memory region to 0xFF. The task id "
+            "is %d \r\n",
+            task_id);
+        xdma_wait(task_id);
+
+        printf("The xdma is finished \r\n");
+        // Check the data
+        for (int i = 0; i < 0x4000; i++) {
+            if (tcdm_0[i] != 0xFF) {
+                printf("The memset of 0KB - 16KB is not correct \r\n");
+                return -1;
+            }
+        }
+        printf("The memset of 0KB - 16KB is correct \r\n");
+
+        // Test 2: Setting the 4K-12K region back to 0. Instead of using the
+        // memset, this test does so by disabling all the reader channels.
+        printf(
+            "Test 2: Setting the 4K-12K region back to 0 by disabling all "
+            "reader channels \r\n");
+        uint32_t sstride_src_t2[1] = {0};
+        uint32_t tstride_src_t2[1] = {64};
+        uint32_t sstride_dst_t2[1] = {8};
+        uint32_t tstride_dst_t2[1] = {64};
+        uint32_t tbound_src_t2[1] = {128};
+        uint32_t tbound_dst_t2[1] = {128};
+
+        if (xdma_memcpy_nd(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t),
+                           sstride_src_t2, sstride_dst_t2, 1, tstride_src_t2,
+                           tbound_src_t2, 1, tstride_dst_t2, tbound_dst_t2, 0x0,
+                           0xffffffff, 0xffffffff) != 0) {
+            printf("Error in xdma agu configuration \r\n");
+            err++;
+        } else {
+            printf("The xdma agu is configured \r\n");
+        }
+
+        if (xdma_disable_dst_ext(0) != 0) {
+            printf("Error in disabling xdma extension 0 \r\n");
+            err++;
+        } else {
+            printf("The xdma extension 0 is disabled \r\n");
+        }
+
+        if (err != 0) {
+            return err;
+        }
+
+        task_id = xdma_start();
+        printf(
+            "The xdma is started, setting memory region to 0x00. The task id "
+            "is %d \r\n",
+            task_id);
+        xdma_wait(task_id);
+
+        printf("The xdma is finished \r\n");
+        // Check the data
+        for (int i = 0; i < 0x1000; i++) {
+            if (tcdm_0[i] != 0xFF) {
+                printf("Error in memset (region 0) \r\n");
+                return -1;
+            }
+        }
+        for (int i = 0x1000; i < 0x3000; i++) {
+            if (tcdm_0[i] != 0x00) {
+                printf("The memset is incorrect (region 1) \r\n");
+                return -1;
+            }
+        }
+        for (int i = 0x3000; i < 0x4000; i++) {
+            if (tcdm_0[i] != 0xFF) {
+                printf("The memset is incorrect (region 2) \r\n");
+                return -1;
+            }
+        }
+        printf("The memset of 4KB - 12KB is correct \r\n");
+
+        // Test 3: Setting the 4-12KB region to 0x0000000000000001 (uint64_t 1)
+        // This test is to validate the byte mask by shielding all other bits,
+        // so only LSB 8 bits are set.
+        printf(
+            "Test 3: Setting the 4-12KB region to 0x0000000000000001 (uint64_t "
+            "1) \r\n");
+        uint32_t sstride_src_t3[1] = {8};
+        uint32_t sstride_dst_t3[1] = {8};
+        uint32_t tstride_src_t3[1] = {64};
+        uint32_t tstride_dst_t3[1] = {64};
+        uint32_t tbound_src_t3[1] = {128};
+        uint32_t tbound_dst_t3[1] = {128};
+        if (xdma_memcpy_nd(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t),
+                           sstride_src_t3, sstride_dst_t3, 1, tstride_src_t3,
+                           tbound_src_t3, 1, tstride_dst_t3, tbound_dst_t3,
+                           0xffffffff, 0xffffffff, 0x1) != 0) {
+            printf("Error in xdma agu configuration \r\n");
+            err++;
+        } else {
+            printf("The xdma agu is configured \r\n");
+        }
+
+        uint32_t ext_param_t3[1] = {0x1};
+        if (xdma_enable_dst_ext(0, ext_param_t3) != 0) {
+            printf("Error in enabling xdma extension 0 \r\n");
+            err++;
+        } else {
+            printf("The xdma extension 0 is enabled \r\n");
+        }
+
+        if (err != 0) {
+            return err;
+        }
+
+        task_id = xdma_start();
+        printf(
+            "The xdma is started, setting memory region to 0x0000000000000001 "
+            "(uint64_t 1). The task id is %d \r\n",
+            task_id);
+        xdma_wait(task_id);
+
+        printf("The xdma is finished \r\n");
+        uint64_t *result_t3 = (uint64_t *)(tcdm_0 + 0x1000 * sizeof(uint8_t));
+        for (int i = 0; i < 0x2000 / 8; i++) {
+            if (result_t3[i] != 1) {
+                printf("Error in memset (region 0) \r\n");
+                return -1;
+            }
+        }
+        printf("The memset of 4KB - 12KB is correct \r\n");
+    } else {
+        printf("Core %d is not xdma core. \r\n", snrt_cluster_core_idx());
+    }
+
+    return 0;
+}
diff --git a/target/sim/sw/device/snax/xdma/Makefile b/target/sim/sw/device/snax/xdma/Makefile
new file mode 100644
index 000000000..cee3a9879
--- /dev/null
+++ b/target/sim/sw/device/snax/xdma/Makefile
@@ -0,0 +1,34 @@
+# Copyright 2023 KU Leuven.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Xiaoling Yi
+
+# Usage of absolute paths is required to externally include
+# this Makefile from multiple different locations
+
+MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+include $(MK_DIR)/../common.mk
+
+############
+## Outputs #
+############
+
+OBJS = $(BUILDDIR)/snax-xdma-lib.o
+ALL_OUTPUTS = $(OBJS)
+INCDIRS += $(abspath include)
+
+##########
+## Rules #
+##########
+
+.PHONY: all
+all: $(ALL_OUTPUTS)
+
+.PHONY: clean
+clean:
+	rm -rf $(BUILDDIR)
+
+$(BUILDDIR):
+	mkdir -p $@
+
diff --git a/target/sim/sw/device/snax/xdma/include/snax-xdma-csr-addr.h b/target/sim/sw/device/snax/xdma/include/snax-xdma-csr-addr.h
new file mode 100644
index 000000000..c68f3c2e9
--- /dev/null
+++ b/target/sim/sw/device/snax/xdma/include/snax-xdma-csr-addr.h
@@ -0,0 +1,45 @@
+// Copyright 2024 KU Leuven.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Yunhao Deng
+
+// This file is generated by Chisel in hw/chisel, do not modify it manually
+
+#define XDMA_BASE_ADDR 960  // every *_PTR macro below is chained from this
+#define XDMA_WIDTH 64  // presumably bytes per transfer beat - TODO confirm
+#define XDMA_SPATIAL_CHAN 8
+#define XDMA_SRC_ADDR_PTR_LSB XDMA_BASE_ADDR
+#define XDMA_SRC_ADDR_PTR_MSB XDMA_SRC_ADDR_PTR_LSB + 1  // unparenthesized sum
+#define XDMA_SRC_SPATIAL_DIM 1
+#define XDMA_SRC_TEMP_DIM 6  // number of src temporal bound/stride CSRs
+#define XDMA_SRC_SPATIAL_STRIDE_PTR XDMA_SRC_ADDR_PTR_MSB + 1
+#define XDMA_SRC_TEMP_BOUND_PTR XDMA_SRC_SPATIAL_STRIDE_PTR + XDMA_SRC_SPATIAL_DIM
+#define XDMA_SRC_TEMP_STRIDE_PTR XDMA_SRC_TEMP_BOUND_PTR + XDMA_SRC_TEMP_DIM
+#define XDMA_SRC_ENABLED_CHAN_PTR XDMA_SRC_TEMP_STRIDE_PTR + XDMA_SRC_TEMP_DIM
+#define XDMA_SRC_BYPASS_PTR XDMA_SRC_ENABLED_CHAN_PTR + 1
+#define XDMA_SRC_EXT_NUM 0  // no extension on the src side
+#define XDMA_SRC_EXT_CSR_PTR XDMA_SRC_BYPASS_PTR + 0
+#define XDMA_SRC_EXT_CSR_NUM 0
+#define XDMA_SRC_EXT_CUSTOM_CSR_NUM \
+    { }
+
+#define XDMA_DST_ADDR_PTR_LSB XDMA_SRC_EXT_CSR_PTR + XDMA_SRC_EXT_CSR_NUM
+#define XDMA_DST_ADDR_PTR_MSB XDMA_DST_ADDR_PTR_LSB + 1
+
+#define XDMA_DST_SPATIAL_DIM 1
+#define XDMA_DST_TEMP_DIM 6  // number of dst temporal bound/stride CSRs
+#define XDMA_DST_SPATIAL_STRIDE_PTR XDMA_DST_ADDR_PTR_MSB + 1
+#define XDMA_DST_TEMP_BOUND_PTR XDMA_DST_SPATIAL_STRIDE_PTR + XDMA_DST_SPATIAL_DIM
+#define XDMA_DST_TEMP_STRIDE_PTR XDMA_DST_TEMP_BOUND_PTR + XDMA_DST_TEMP_DIM
+#define XDMA_DST_ENABLED_CHAN_PTR XDMA_DST_TEMP_STRIDE_PTR + XDMA_DST_TEMP_DIM
+#define XDMA_DST_ENABLED_BYTE_PTR XDMA_DST_ENABLED_CHAN_PTR + 1
+#define XDMA_DST_BYPASS_PTR XDMA_DST_ENABLED_BYTE_PTR + 1
+#define XDMA_DST_EXT_NUM 3  // three extensions on the dst side
+#define XDMA_DST_EXT_CSR_PTR XDMA_DST_BYPASS_PTR + 1
+#define XDMA_DST_EXT_CSR_NUM 2  // sum of the per-extension counts below
+#define XDMA_DST_EXT_CUSTOM_CSR_NUM \
+    { 1, 1, 0 }  // one entry per dst extension
+#define XDMA_START_PTR XDMA_DST_EXT_CSR_PTR + XDMA_DST_EXT_CSR_NUM
+#define XDMA_COMMIT_TASK_PTR XDMA_START_PTR + 1
+#define XDMA_FINISH_TASK_PTR XDMA_COMMIT_TASK_PTR + 1
diff --git a/target/sim/sw/device/snax/xdma/include/snax-xdma-lib.h b/target/sim/sw/device/snax/xdma/include/snax-xdma-lib.h
new file mode 100644
index 000000000..97d1836b8
--- /dev/null
+++ b/target/sim/sw/device/snax/xdma/include/snax-xdma-lib.h
@@ -0,0 +1,38 @@
+// Copyright 2024 KU Leuven.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Yunhao Deng
+
+#pragma once
+#include <stdbool.h>
+#include "stdint.h"
+// Define the CSR address of xdma, should be generated by scala
+#include "snax-xdma-csr-addr.h"
+
+// Configure an N-dimensional xdma transfer. Returns 0 on success and a
+// negative value when the requested temporal dimensions are not supported.
+int32_t xdma_memcpy_nd(uint8_t* src, uint8_t* dst, uint32_t* spatial_stride_src,
+                       uint32_t* spatial_stride_dst, uint32_t temp_dim_src,
+                       uint32_t* temp_stride_src, uint32_t* temp_bound_src,
+                       uint32_t temp_dim_dst, uint32_t* temp_stride_dst,
+                       uint32_t* temp_bound_dst, uint32_t enabled_chan_src,
+                       uint32_t enabled_chan_dst, uint32_t enabled_byte_dst);
+// Configure a 1-D transfer of `size` bytes; size must be a multiple of
+// XDMA_WIDTH, otherwise -1 is returned.
+int32_t xdma_memcpy_1d(uint8_t* src, uint8_t* dst, uint32_t size);
+// Enable (un-bypass) / disable (bypass) extension `ext` on the src or dst
+// side. csr_value supplies the extension's custom CSR values.
+int32_t xdma_enable_src_ext(uint8_t ext, uint32_t* csr_value);
+int32_t xdma_disable_src_ext(uint8_t ext);
+int32_t xdma_enable_dst_ext(uint8_t ext, uint32_t* csr_value);
+int32_t xdma_disable_dst_ext(uint8_t ext);
+
+// Start xdma
+uint32_t xdma_start();
+
+// Check if xdma is finished
+bool xdma_is_finished(uint32_t task_id);
+
+// Busy-wait until the task with the given id has finished
+void xdma_wait(uint32_t task_id);
diff --git a/target/sim/sw/device/snax/xdma/src/snax-xdma-lib.c b/target/sim/sw/device/snax/xdma/src/snax-xdma-lib.c
new file mode 100644
index 000000000..52e11df67
--- /dev/null
+++ b/target/sim/sw/device/snax/xdma/src/snax-xdma-lib.c
@@ -0,0 +1,201 @@
+// Copyright 2024 KU Leuven.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Yunhao Deng
+
+#include "snax-xdma-lib.h"
+#include <stdbool.h>  // NOTE(review): header name reconstructed - confirm
+#include "snrt.h"
+#include "stdint.h"
+
+#define XDMA_DEBUG
+#ifdef XDMA_DEBUG
+#define XDMA_DEBUG_PRINT(...) printf(__VA_ARGS__)
+#else
+#define XDMA_DEBUG_PRINT(...)
+#endif
+
+// Program the xdma address generators for an N-dimensional transfer: base
+// addresses, spatial strides, temporal bounds/strides (unused dimensions get
+// bound 1 and stride 0) and the channel/byte enable masks.
+// Returns 0 on success, or -4 (without writing any CSR) when temp_dim_src or
+// temp_dim_dst exceeds the number of temporal dimensions of the hardware.
+int32_t xdma_memcpy_nd(uint8_t* src, uint8_t* dst, uint32_t* spatial_stride_src,
+                       uint32_t* spatial_stride_dst, uint32_t temp_dim_src,
+                       uint32_t* temp_stride_src, uint32_t* temp_bound_src,
+                       uint32_t temp_dim_dst, uint32_t* temp_stride_dst,
+                       uint32_t* temp_bound_dst, uint32_t enabled_chan_src,
+                       uint32_t enabled_chan_dst, uint32_t enabled_byte_dst) {
+    // Validate the dimension counts first, so that a rejected request never
+    // leaves a partially written configuration behind
+    if (temp_dim_src > XDMA_SRC_TEMP_DIM) {
+        XDMA_DEBUG_PRINT("Source dimension is too high for xdma\n");
+        return -4;
+    }
+    if (temp_dim_dst > XDMA_DST_TEMP_DIM) {
+        XDMA_DEBUG_PRINT("Destination dimension is too high for xdma\n");
+        return -4;
+    }
+
+    // Pointers go through uintptr_t so the cast is valid for any pointer width
+    csrw_ss(XDMA_SRC_ADDR_PTR_LSB, (uint32_t)(uintptr_t)src);
+    csrw_ss(XDMA_SRC_ADDR_PTR_MSB, (uint32_t)((uint64_t)(uintptr_t)src >> 32));
+
+    csrw_ss(XDMA_DST_ADDR_PTR_LSB, (uint32_t)(uintptr_t)dst);
+    csrw_ss(XDMA_DST_ADDR_PTR_MSB, (uint32_t)((uint64_t)(uintptr_t)dst >> 32));
+    // Rule check: the src and dst frame counts (products of the temporal
+    // bounds) should be equal. A mismatch is only reported, not rejected.
+    uint32_t src_size = 1;
+    for (uint32_t i = 0; i < temp_dim_src; i++) {
+        src_size *= temp_bound_src[i];
+    }
+    uint32_t dst_size = 1;
+    for (uint32_t i = 0; i < temp_dim_dst; i++) {
+        dst_size *= temp_bound_dst[i];
+    }
+    if (src_size != dst_size) {
+        XDMA_DEBUG_PRINT("src loop and dst loop is not equal\n");
+        // return -3;
+    }
+    // Spatial Stride 0 to XDMA_SRC_SPATIAL_DIM at src
+    for (uint32_t i = 0; i < XDMA_SRC_SPATIAL_DIM; i++) {
+        csrw_ss(XDMA_SRC_SPATIAL_STRIDE_PTR + i, spatial_stride_src[i]);
+    }
+    // Spatial Stride 0 to XDMA_DST_SPATIAL_DIM at dst
+    for (uint32_t i = 0; i < XDMA_DST_SPATIAL_DIM; i++) {
+        csrw_ss(XDMA_DST_SPATIAL_STRIDE_PTR + i, spatial_stride_dst[i]);
+    }
+    // Temporal Dimension 0 to n at src
+    for (uint32_t i = 0; i < temp_dim_src; i++) {
+        csrw_ss(XDMA_SRC_TEMP_BOUND_PTR + i, temp_bound_src[i]);
+        csrw_ss(XDMA_SRC_TEMP_STRIDE_PTR + i, temp_stride_src[i]);
+    }
+    // Dimension n to MAX at src
+    for (uint32_t i = temp_dim_src; i < XDMA_SRC_TEMP_DIM; i++) {
+        csrw_ss(XDMA_SRC_TEMP_BOUND_PTR + i, 1);
+        csrw_ss(XDMA_SRC_TEMP_STRIDE_PTR + i, 0);
+    }
+    // Temporal Dimension 0 to n at dst
+    for (uint32_t i = 0; i < temp_dim_dst; i++) {
+        csrw_ss(XDMA_DST_TEMP_BOUND_PTR + i, temp_bound_dst[i]);
+        csrw_ss(XDMA_DST_TEMP_STRIDE_PTR + i, temp_stride_dst[i]);
+    }
+    // Dimension n to MAX at dst
+    for (uint32_t i = temp_dim_dst; i < XDMA_DST_TEMP_DIM; i++) {
+        csrw_ss(XDMA_DST_TEMP_BOUND_PTR + i, 1);
+        csrw_ss(XDMA_DST_TEMP_STRIDE_PTR + i, 0);
+    }
+    // Enabled channel at src
+    csrw_ss(XDMA_SRC_ENABLED_CHAN_PTR, enabled_chan_src);
+    // Enabled channel at dst
+    csrw_ss(XDMA_DST_ENABLED_CHAN_PTR, enabled_chan_dst);
+    // Enabled byte at dst
+    csrw_ss(XDMA_DST_ENABLED_BYTE_PTR, enabled_byte_dst);
+    return 0;
+}
+
+// Program a 1-D transfer of `size` bytes from src to dst. Returns -1 when
+// size is not a multiple of XDMA_WIDTH, otherwise the result of
+// xdma_memcpy_nd.
+int32_t xdma_memcpy_1d(uint8_t* src, uint8_t* dst, uint32_t size) {
+    if (size % XDMA_WIDTH != 0) {
+        XDMA_DEBUG_PRINT("Size is not multiple of XDMA_WIDTH\n");
+        return -1;
+    }
+    uint32_t spatial_stride[1] = {XDMA_WIDTH / XDMA_SPATIAL_CHAN};
+    uint32_t temporal_stride[1] = {XDMA_WIDTH};
+    uint32_t temporal_bound[1] = {size / XDMA_WIDTH};
+    // The stride/bound arrays describe exactly one temporal dimension, so the
+    // dimension count passed on must be 1 (2 would read past their end)
+    return xdma_memcpy_nd(src, dst, spatial_stride, spatial_stride, 1,
+                          temporal_stride, temporal_bound, 1, temporal_stride,
+                          temporal_bound, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+}
+
+// xdma extension interface
+// Enable src-side extension `ext`: clears its bypass bit and writes its custom
+// CSRs from csr_value. Returns -1 when ext is out of range, 0 otherwise.
+int32_t xdma_enable_src_ext(uint8_t ext, uint32_t* csr_value) {
+    if (ext >= XDMA_SRC_EXT_NUM) {
+        return -1;
+    }
+    uint8_t custom_csr_list[XDMA_SRC_EXT_NUM] = XDMA_SRC_EXT_CUSTOM_CSR_NUM;
+    uint32_t csr_offset = XDMA_SRC_EXT_CSR_PTR;
+    for (uint8_t i = 0; i < ext; i++) {
+        csr_offset += custom_csr_list[i];
+    }
+
+    // Not bypass the xdma extension -> set the corresponding CSR bit to 0
+    csrw_ss(XDMA_SRC_BYPASS_PTR, csrr_ss(XDMA_SRC_BYPASS_PTR) & ~(1 << ext));
+
+    for (uint8_t i = 0; i < custom_csr_list[ext]; i++) {
+        csrw_ss(csr_offset + i, csr_value[i]);
+    }
+    return 0;
+}
+
+// Enable dst-side extension `ext`: clears its bypass bit and writes its custom
+// CSRs from csr_value. Returns -1 when ext is out of range, 0 otherwise.
+int32_t xdma_enable_dst_ext(uint8_t ext, uint32_t* csr_value) {
+    if (ext >= XDMA_DST_EXT_NUM) {
+        return -1;
+    }
+    uint8_t custom_csr_list[XDMA_DST_EXT_NUM] = XDMA_DST_EXT_CUSTOM_CSR_NUM;
+    uint32_t csr_offset = XDMA_DST_EXT_CSR_PTR;
+    for (uint8_t i = 0; i < ext; i++) {
+        csr_offset += custom_csr_list[i];
+    }
+
+    // Not bypass the xdma extension -> set the corresponding CSR bit to 0
+    csrw_ss(XDMA_DST_BYPASS_PTR, csrr_ss(XDMA_DST_BYPASS_PTR) & ~(1 << ext));
+    for (uint8_t i = 0; i < custom_csr_list[ext]; i++) {
+        csrw_ss(csr_offset + i, csr_value[i]);
+    }
+    return 0;
+}
+
+// Disable src-side extension `ext` by setting its bypass bit. An out-of-range
+// ext is ignored; the function always returns 0.
+int32_t xdma_disable_src_ext(uint8_t ext) {
+    if (ext >= XDMA_SRC_EXT_NUM) {
+        return 0;
+    }
+    // Bypass the xdma extension -> set the corresponding CSR bit to 1
+    csrw_ss(XDMA_SRC_BYPASS_PTR, csrr_ss(XDMA_SRC_BYPASS_PTR) | (1 << ext));
+    return 0;
+}
+
+// Disable dst-side extension `ext` by setting its bypass bit. An out-of-range
+// ext is ignored; the function always returns 0.
+int32_t xdma_disable_dst_ext(uint8_t ext) {
+    if (ext >= XDMA_DST_EXT_NUM) {
+        return 0;
+    }
+    // Bypass the xdma extension -> set the corresponding CSR bit to 1
+    csrw_ss(XDMA_DST_BYPASS_PTR, csrr_ss(XDMA_DST_BYPASS_PTR) | (1 << ext));
+    return 0;
+}
+
+// Start xdma: writes the start CSR, waits until the commit-task CSR changes
+// and returns its new value as the task id
+uint32_t xdma_start() {
+    uint32_t ret = csrr_ss(XDMA_COMMIT_TASK_PTR);
+    csrw_ss(XDMA_START_PTR, 1);
+    while (csrr_ss(XDMA_COMMIT_TASK_PTR) == ret) {
+        // Wait for xdma to start
+    }
+    return csrr_ss(XDMA_COMMIT_TASK_PTR);
+}
+
+// Check if xdma is finished: true once the finish-task CSR has reached task_id
+bool xdma_is_finished(uint32_t task_id) {
+    return csrr_ss(XDMA_FINISH_TASK_PTR) >= task_id;
+}
+
+// Busy-wait until the task with the given id has finished
+void xdma_wait(uint32_t task_id) {
+    while (!xdma_is_finished(task_id)) {
+        // Wait for xdma to finish
+    }
+}
diff --git a/target/sim/sw/host/apps/offload/Makefile b/target/sim/sw/host/apps/offload/Makefile
index 5f673626e..7e74faef4 100644
--- a/target/sim/sw/host/apps/offload/Makefile
+++ b/target/sim/sw/host/apps/offload/Makefile
@@ -36,6 +36,11 @@ DEVICE_APPS += snax/snax-data-reshuffler
 DEVICE_APPS += snax/snax-streamer-gemm-conv
 DEVICE_APPS += snax/snax-streamer-gemm-conv-simd
 DEVICE_APPS += snax/snax-test-integration
+DEVICE_APPS += snax/snax-hypercorex-test-csr
+DEVICE_APPS += snax/snax-hypercorex-char-recog
+DEVICE_APPS += snax/snax-xdma-maxpool
+DEVICE_APPS += snax/snax-xdma-memset
+
 # Dependencies
 INCDIRS += $(RUNTIME_DIR)
 INCDIRS += $(HOST_DIR)/../shared/platform/generated