diff --git a/sw/blas/blas.h b/sw/blas/blas.h index a7910d25e2..9207bf6f74 100644 --- a/sw/blas/blas.h +++ b/sw/blas/blas.h @@ -5,4 +5,5 @@ #pragma once #include "axpy/src/axpy.h" -#include "gemm/src/gemm.h" \ No newline at end of file +#include "gemm/src/gemm.h" +#include "dotp/src/dotp.h" diff --git a/sw/blas/dotp/Makefile b/sw/blas/dotp/Makefile new file mode 100644 index 0000000000..49ff75b883 --- /dev/null +++ b/sw/blas/dotp/Makefile @@ -0,0 +1,31 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Usage of absolute paths is required to externally include this Makefile +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) +DATA_DIR := $(realpath $(MK_DIR)/data) +SRC_DIR := $(realpath $(MK_DIR)/src) + +DATA_CFG ?= $(DATA_DIR)/params.json +SECTION ?= + +APP ?= dotp +SRCS ?= $(realpath $(SRC_DIR)/main.c) +INCDIRS ?= $(dir $(DATA_H)) $(SRC_DIR) + +DATAGEN_PY = $(MK_DIR)/scripts/datagen.py +DATA_H ?= $(DATA_DIR)/data.h + +$(dir $(DATA_H)): + mkdir -p $@ + +$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) | $(dir $(DATA_H)) + $< -c $(DATA_CFG) --section="$(SECTION)" > $@ + +.PHONY: clean-data clean + +clean-data: + rm -f $(DATA_H) + +clean: clean-data diff --git a/sw/blas/dotp/data/params.json b/sw/blas/dotp/data/params.json new file mode 100644 index 0000000000..66dfcf770f --- /dev/null +++ b/sw/blas/dotp/data/params.json @@ -0,0 +1,7 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + n: 4096 +} diff --git a/sw/blas/dotp/scripts/datagen.py b/sw/blas/dotp/scripts/datagen.py new file mode 100755 index 0000000000..94a5e1be1c --- /dev/null +++ b/sw/blas/dotp/scripts/datagen.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. 
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import os
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+from data_utils import format_scalar_definition, format_array_definition, \
+    format_array_declaration, format_ifdef_wrapper, DataGen  # noqa: E402
+
+
+class AxpyDataGen(DataGen):
+    """Generates the input vectors and the golden result of the dotp kernel."""
+    MIN = -1000
+    MAX = +1000
+    # AXI splits bursts crossing 4KB address boundaries. To minimize
+    # the occurrence of these splits the data should be aligned to 4KB
+    BURST_ALIGNMENT = 4096
+
+    def golden_model(self, x, y):
+        return np.dot(x, y)
+
+    def emit_header(self, **kwargs):
+        header = [super().emit_header()]
+        # Validate the problem size before generating any data
+        n = kwargs['n']
+        assert (n % 8) == 0, "n must be an integer multiple of the number of cores"
+
+        x = np.random.uniform(self.MIN, self.MAX, n)
+        y = np.random.uniform(self.MIN, self.MAX, n)
+        g = self.golden_model(x, y)
+
+        header += [format_scalar_definition('const uint32_t', 'n', n)]
+        header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT,
+                                           section=kwargs['section'])]
+        header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT,
+                                           section=kwargs['section'])]
+        header += [format_array_declaration('double', 'z', [n], alignment=self.BURST_ALIGNMENT,
+                                            section=kwargs['section'])]
+        result_def = format_scalar_definition('double', 'g', g)
+        header += [format_ifdef_wrapper('BIST', result_def)]
+        header = '\n\n'.join(header)
+
+        return header
+
+
+if __name__ == '__main__':
+    sys.exit(AxpyDataGen().main())
diff --git a/sw/blas/dotp/scripts/verify.py b/sw/blas/dotp/scripts/verify.py
new file mode 100755
index 0000000000..5ea42423e0
--- /dev/null
+++ b/sw/blas/dotp/scripts/verify.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+from pathlib import Path
+from datagen import AxpyDataGen
+
+sys.path.append(str(Path(__file__).parent / '../../../../util/sim/'))
+from verif_utils import Verifier  # noqa: E402
+
+
+class AxpyVerifier(Verifier):
+    """Checks the simulated output 'z' against the dotp golden model."""
+    OUTPUT_UIDS = ['z']
+
+    def get_actual_results(self):
+        return self.get_output_from_symbol('z', 'double')
+
+    def get_expected_results(self):
+        # The dot product has no scalar operand: 'x' and 'y' are the only inputs
+        x = self.get_input_from_symbol('x', 'double')
+        y = self.get_input_from_symbol('y', 'double')
+        return AxpyDataGen().golden_model(x, y)
+
+    def check_results(self, *args):
+        return super().check_results(*args, rtol=1e-10)
+
+
+if __name__ == "__main__":
+    sys.exit(AxpyVerifier().main())
diff --git a/sw/blas/dotp/src/dotp.h b/sw/blas/dotp/src/dotp.h
new file mode 100644
index 0000000000..f2052e7397
--- /dev/null
+++ b/sw/blas/dotp/src/dotp.h
@@ -0,0 +1,101 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "snrt.h"
+
+// Computes the dot product of the N-element vectors input_A and input_B with
+// a single accumulator fed by two SSR streams, and stores it in output[0].
+// N must be at least 1, since the FREP repetition operand is N - 1.
+static inline void dotp_seq(uint32_t N, double *input_A, double *input_B,
+                            double *output) {
+    // Start of SSR region.
+    register volatile double ft0 asm("ft0");
+    register volatile double ft1 asm("ft1");
+    asm volatile(""
+                 : "=f"(ft0), "=f"(ft1));
+
+    snrt_ssr_loop_1d(SNRT_SSR_DM0, N, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM1, N, sizeof(double));
+
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, input_A);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, input_B);
+
+    register volatile double res_ssr asm("fs0") = 0;
+
+    snrt_ssr_enable();
+
+    const register uint32_t Nm1 asm("t0") = N - 1;
+    asm volatile(
+        "frep.o %[n_frep], 1, 0, 0 \n"
+        "fmadd.d %0, ft0, ft1, %0"
+        : "=f"(res_ssr) /* output operands */
+        : "f"(ft0), "f"(ft1), "0"(res_ssr), [n_frep]"r"(Nm1) /* input operands */
+        :);
+
+    // End of SSR region.
+    snrt_fpu_fence();
+    snrt_ssr_disable();
+    asm volatile(""
+                 :
+                 : "f"(ft0), "f"(ft1));
+    output[0] = res_ssr;
+}
+
+// Variant of dotp_seq that spreads the accumulation over four registers
+// (fs0-fs3) and adds them up after the FREP loop. Each loop iteration issues
+// four fmadd.d, and the repetition operand is (N >> 2) - 1, so N must be a
+// non-zero multiple of 4. The result is stored in output[0].
+static inline void dotp_seq_4_acc(uint32_t N, double *input_A, double *input_B,
+                                  double *output) {
+    // Start of SSR region.
+    register volatile double ft0 asm("ft0");
+    register volatile double ft1 asm("ft1");
+    asm volatile(""
+                 : "=f"(ft0), "=f"(ft1));
+
+    snrt_ssr_loop_1d(SNRT_SSR_DM0, N, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM1, N, sizeof(double));
+
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, input_A);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, input_B);
+
+    register volatile double res_ssr_0 asm("fs0") = 0;
+    register volatile double res_ssr_1 asm("fs1") = 0;
+    register volatile double res_ssr_2 asm("fs2") = 0;
+    register volatile double res_ssr_3 asm("fs3") = 0;
+
+    snrt_ssr_enable();
+
+    const register uint32_t Nm1 asm("t0") = (N >> 2) - 1;
+    asm volatile(
+        "frep.o %[n_frep], 4, 0, 0 \n"
+        "fmadd.d %0, ft0, ft1, %0 \n"
+        "fmadd.d %1, ft0, ft1, %1 \n"
+        "fmadd.d %2, ft0, ft1, %2 \n"
+        "fmadd.d %3, ft0, ft1, %3"
+        : "=f"(res_ssr_0), "=f"(res_ssr_1), "=f"(res_ssr_2), "=f"(res_ssr_3) /* output operands */
+        : "f"(ft0), "f"(ft1), "0"(res_ssr_0), "1"(res_ssr_1), "2"(res_ssr_2), "3"(res_ssr_3), [n_frep]"r"(Nm1) /* input operands */
+        :);
+
+    // End of SSR region.
+    snrt_fpu_fence();
+    snrt_ssr_disable();
+
+    // The accumulators are read before being overwritten, hence they must be
+    // read-write ("+f") operands rather than write-only ("=f") ones.
+    asm volatile(
+        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_1] \n"
+        "fadd.d %[res_ssr_2], %[res_ssr_2], %[res_ssr_3] \n"
+        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_2]"
+        : [res_ssr_0]"+f"(res_ssr_0), [res_ssr_2]"+f"(res_ssr_2) /* output operands */
+        : [res_ssr_1]"f"(res_ssr_1), [res_ssr_3]"f"(res_ssr_3) /* input operands */
+        :);
+
+    asm volatile(""
+                 :
+                 : "f"(ft0), "f"(ft1));
+    output[0] = res_ssr_0;
+}
diff --git a/sw/blas/dotp/src/main.c b/sw/blas/dotp/src/main.c
new file mode 100644
index 0000000000..c289952efa
--- /dev/null
+++ b/sw/blas/dotp/src/main.c
@@ -0,0 +1,107 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "snrt.h"
+
+#include "printf.h"
+
+#define XSSR
+#include "dotp.h"
+#include "data.h"
+
+// Computes dot(x, y): per-core partial sums in TCDM, reduced by core 0
+int main() {
+    double *local_x, *local_y, *local_z;
+    double *remote_x, *remote_y, *remote_z;
+    volatile double sum = 0;
+    uint32_t start_cycle = 0, end_cycle = 0;
+
+    // Calculate size and pointers for each cluster
+    uint32_t frac = n / snrt_cluster_num();
+    uint32_t offset = frac * snrt_cluster_idx();
+    remote_x = x + offset;
+    remote_y = y + offset;
+    remote_z = z + snrt_cluster_idx();
+
+    // Allocate space in TCDM
+    local_x = (double *)snrt_l1_next();
+    local_y = local_x + frac;
+    local_z = local_y + frac;
+
+    // Copy data in TCDM
+    if (snrt_is_dm_core()) {
+        size_t size = frac * sizeof(double);
+        snrt_dma_start_1d(local_x, remote_x, size);
+        snrt_dma_start_1d(local_y, remote_y, size);
+        snrt_dma_wait_all();
+    }
+
+    // Calculate TCDM size and pointers for each core
+    int core_idx = snrt_cluster_core_idx();
+    int frac_core = frac / snrt_cluster_compute_core_num();
+    int offset_core = core_idx * frac_core;
+    local_x += offset_core;
+    local_y += offset_core;
+    // local_z stays at the cluster-wide base: partial i lives in local_z[i]
+
+    snrt_cluster_hw_barrier();
+
+    // Compute
+    if (!snrt_is_dm_core()) {
+        start_cycle = snrt_mcycle();
+        dotp_seq_4_acc(frac_core, local_x, local_y, &local_z[core_idx]);
+    }
+    // Executed by every core (DM core included) so that barrier counts match
+    snrt_cluster_hw_barrier();
+    if (!snrt_is_dm_core()) {
+#ifndef _DOTP_EXCLUDE_FINAL_SYNC_
+        if (!core_idx) {
+            for (uint32_t i = 0; i < snrt_cluster_compute_core_num(); ++i) {
+                sum += local_z[i];
+            }
+            local_z[0] = sum;
+        }
+        snrt_fpu_fence();
+#endif
+        end_cycle = snrt_mcycle();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    if (!snrt_cluster_core_idx()) {
+        unsigned int runtime = end_cycle - start_cycle;
+        double performance = (double) (2 * n - 1) / runtime;
+        double util = 100 * (performance / (2 * snrt_cluster_compute_core_num()));
+        printf("Core %d execution time: %u cycles\nPerformance: %f DP-FLOP/Cycle\nUtilization: %f%%\n",
+               snrt_cluster_core_idx(), runtime, performance, util);
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Copy the cluster's result (local_z[0]) out of TCDM
+    if (snrt_is_dm_core()) {
+        size_t size = sizeof(double);
+        snrt_dma_start_1d(remote_z, local_z, size);
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+// TODO: currently only works for single cluster otherwise need to
+// synchronize all cores here
+#ifdef BIST
+    uint32_t nerr = 1;
+    // Check result within rtol 1e-10: summation order differs from golden
+    if (snrt_global_core_idx() == 0) {
+        double err = (sum > g) ? sum - g : g - sum;
+        double mag = (g < 0) ? -g : g;
+        if (err <= 1e-10 * mag) nerr--;
+        printf("%f %f\n", sum, g);
+    }
+
+    return nerr;
+#endif
+
+    return 0;
+}
diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk
index 1415fcb4e4..329b606a57 100644
--- a/target/snitch_cluster/sw.mk
+++ b/target/snitch_cluster/sw.mk
@@ -42,6 +42,7 @@ APPS = sw/apps/lto
 APPS += sw/apps/nop
 APPS += sw/apps/blas/axpy
 APPS += sw/apps/blas/gemm
+APPS += sw/apps/blas/dotp
 APPS += sw/apps/dnn/batchnorm
 APPS += sw/apps/dnn/conv2d
 APPS += sw/apps/dnn/fusedconv
diff --git a/target/snitch_cluster/sw/apps/blas/dotp/Makefile b/target/snitch_cluster/sw/apps/blas/dotp/Makefile
new file mode 100644
index 0000000000..63f748994d
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/blas/dotp/Makefile
@@ -0,0 +1,10 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Matteo Perotti
+
+include ../../../../../../sw/blas/dotp/Makefile
+include ../../common.mk
+
+$(DEP): $(DATA_H)