diff --git a/sw/dnn/gemm/data/datagen.py b/sw/dnn/gemm/data/datagen.py
deleted file mode 100755
index 17f4df0496..0000000000
--- a/sw/dnn/gemm/data/datagen.py
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2023 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Tim Fischer
-# Viviane Potocnik
-# Luca Colagrande
-
-import argparse
-import pathlib
-import json5
-import sys
-import os
-import torch
-
-sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
-import data_utils  # noqa: E402
-from data_utils import emit_license, \
-    format_struct_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper  # noqa: E402
-
-torch.manual_seed(42)
-
-# AXI splits bursts crossing 4KB address boundaries. To minimize
-# the occurrence of these splits the data should be aligned to 4KB
-BURST_ALIGNMENT = 4096
-
-
-def rand_data_generator(shape, prec, alt=False):
-    if prec == 'FP64':
-        return torch.randn(shape, requires_grad=False, dtype=torch.float64), {}
-    elif prec == 'FP32':
-        return torch.randn(shape, requires_grad=False, dtype=torch.float32), {}
-    elif prec == 'FP16':
-        if alt:
-            return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {}
-        else:
-            return torch.randn(shape, requires_grad=False, dtype=torch.float16), {}
-    elif prec == 'FP8':
-        sign = torch.randint(0, 2, shape,
-                             requires_grad=False, dtype=torch.uint8)  # -1 or 1
-        exponent = torch.randint(0, 16, shape,
-                                 requires_grad=False, dtype=torch.uint8)  # < 0b01111
-        mantissa = torch.randint(0, 4, shape,
-                                 requires_grad=False, dtype=torch.uint8)  # can be arbitrary
-        bits = {'sign': sign, 'exponent': exponent, 'mantissa': mantissa}
-        # TODO: not actually correct
-        sign_val = (-1.0)**sign.double()
-        exp_val = (2.0**(exponent.double()-15.0))
-        man_val = (1.0 + mantissa.double() / (2**2))
-        val = sign_val*exp_val*man_val
-        return val, bits
-
-
-def golden_model(alpha, A, B, C):
-    return alpha * C + torch.matmul(A, B)
-
-
-def emit_header(**kwargs):
-
-    M = kwargs['M']
-    N = kwargs['N']
-    K = kwargs['K']
-    alpha = kwargs['alpha']
-    expand = kwargs['expand']
-    transpose_A = kwargs['transpose_A']
-    transpose_B = kwargs['transpose_B']
-    prec = kwargs['prec']
-
-    print(prec)
-    mat_A, bits_A = rand_data_generator((M, K), prec)
-    mat_B, bits_B = rand_data_generator((K, N), prec)
-    mat_C, bits_C = rand_data_generator((M, N), prec)
-
-    result = golden_model(alpha, mat_A, mat_B, mat_C)
-
-    if transpose_A:
-        mat_A = mat_A.T
-    if transpose_B:
-        mat_B = mat_B.T
-
-    ctype = data_utils.ctype_from_precision_t(prec)
-
-    A_uid = 'A'
-    B_uid = 'B'
-    C_uid = 'C'
-
-    layer_cfg = {
-        'M': M,
-        'N': N,
-        'K': K,
-        'TA': int(transpose_A),
-        'TB': int(transpose_B),
-        'ALPHA': alpha,
-        'expand': expand,
-        'dtype': prec,
-        'A': A_uid,
-        'B': B_uid,
-        'C': C_uid
-    }
-
-    data_str = [emit_license()]
-    # Array forward declarations
-    data_str += [format_array_declaration(ctype, A_uid, mat_A.shape)]
-    data_str += [format_array_declaration(ctype, B_uid, mat_B.shape)]
-    data_str += [format_array_declaration(ctype, C_uid, mat_C.shape)]
-    # Layer struct
-    data_str += [format_struct_definition('gemm_layer_t', 'layer', layer_cfg)]
-    # Array definitions
-    if prec == 'FP8':
-        data_str += [format_array_definition(ctype, A_uid, bits_A)]
-        data_str += [format_array_definition(ctype, B_uid, bits_B)]
-        data_str += [format_array_definition(ctype, C_uid, bits_C)]
-    else:
-        data_str += [format_array_definition(ctype, A_uid, mat_A)]
-        data_str += [format_array_definition(ctype, B_uid, mat_B)]
-        data_str += [format_array_definition(ctype, C_uid, mat_C)]
-    # Golden results for BIST
-    result_def = format_array_definition(ctype, 'checksum', torch.sum(result, dim=-1))
-    data_str += [format_ifdef_wrapper('BIST', result_def)]
-    data_str = '\n\n'.join(data_str)
-
-    return data_str
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for layernorm kernel')
-    parser.add_argument(
-        "-c", "--cfg",
-        type=pathlib.Path,
-        required=True,
-        help='Select param config file kernel'
-    )
-    parser.add_argument(
-        '--section',
-        type=str,
-        help='Section to store matrices in')
-    parser.add_argument(
-        'output',
-        type=pathlib.Path,
-        help='Path of the output header file')
-    args = parser.parse_args()
-
-    # Load param config file
-    with args.cfg.open() as f:
-        param = json5.loads(f.read())
-    param['section'] = args.section
-
-    # Emit header file
-    with open(args.output, 'w') as f:
-        f.write(emit_header(**param))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/sw/dnn/gemm/data/params.json b/sw/dnn/gemm/data/params.json
deleted file mode 100644
index 11badbf202..0000000000
--- a/sw/dnn/gemm/data/params.json
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
-// Solderpad Hardware License, Version 0.51, see LICENSE for details.
-// SPDX-License-Identifier: SHL-0.51
-
-{
-    M: 16,
-    N: 16,
-    K: 16,
-    alpha: 0,
-    transpose_A: false,
-    transpose_B: true,
-    prec: "FP32",
-    expand: 0
-}
diff --git a/sw/dnn/gemm/src/gemm.h b/sw/dnn/gemm/src/gemm.h
deleted file mode 100644
index cf2b2949e0..0000000000
--- a/sw/dnn/gemm/src/gemm.h
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <stdint.h>
-#include "blas.h"
-
-/**
- * @struct gemm_layer_struct
- * @brief This structure contains all parameters necessary for GEMM.
- * @var gemm_layer_struct::M
- * Dimension of matrix product MxK * KxN
- * @var gemm_layer_struct::M_p
- * M divided by number of compute cores
- * @var gemm_layer_struct::N
- * Dimension of matrix product MxK * KxN
- * @var gemm_layer_struct::K
- * Dimension of matrix product MxK * KxN
- * @var gemm_layer_struct::TA
- * Transpose matrix A
- * @var gemm_layer_struct::TB
- * Transpose matrix B
- * @var gemm_layer_struct::TILE_M
- * Tile factor across M dimension
- * @var gemm_layer_struct::TILE_N
- * Tile factor across N dimension
- * @var gemm_layer_struct::TILE_K
- * Tile factor across K dimension
- * @var gemm_layer_struct::A
- * Pointer to matrix A
- * @var gemm_layer_struct::B
- * Pointer to matrix B
- * @var gemm_layer_struct::C
- * Pointer to matrix C
- * @var gemm_layer_struct::ALPHA
- * constant factor: A * B + ALPHA * C
- * @var gemm_layer_struct::dtype
- * Precision of GEMM
- * @var gemm_layer_struct::expand
- * Use expanding DOTP instructions
- */
-typedef struct gemm_layer_struct {
-    uint32_t M;
-    uint32_t M_p;
-    uint32_t N;
-    uint32_t K;
-
-    uint32_t TA;
-    uint32_t TB;
-
-    uint32_t TILE_M;
-    uint32_t TILE_N;
-    uint32_t TILE_K;
-
-    void *A;
-    void *B;
-    void *C;
-
-    uint32_t ALPHA;
-
-    precision_t dtype;
-    uint32_t expand;
-} gemm_layer_t;
diff --git a/sw/dnn/gemm/src/main.c b/sw/dnn/gemm/src/main.c
deleted file mode 100644
index 62ca12cf29..0000000000
--- a/sw/dnn/gemm/src/main.c
+++ /dev/null
@@ -1,207 +0,0 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// SW testbench for profiling GEMM kernels in different
-// floating point precisions (fp64, fp32, fp16), as well as
-// different memory layouts for matrices (transposed/not-transposed)
-
-#define BIST
-
-#include "dnn.h"
-#include "snrt.h"
-
-#include "data.h"
-
-// Padding of innermost dimension of a Matrix
-// Useful for preventing banking conflicts between cores
-// that are accessing different rows of the matrix
-#define MAT_ROW_PADDING 0
-
-// Padding in between matrices A, B for preventing
-// banking conflicts in the beginning
-#define MAT_PADDING 0
-
-void *share_ptr;
-
-int main() {
-    const gemm_layer_t l1_gemm_l = layer;
-
-    const uint32_t cluster_num = snrt_cluster_num();
-    const uint32_t cluster_id = snrt_cluster_idx();
-    const uint32_t compute_num = snrt_cluster_compute_core_num();
-    const uint32_t compute_id = snrt_global_core_idx();
-
-    void *mat_A, *mat_B, *mat_C;
-
-    uint32_t mat_A_size =
-        (l1_gemm_l.M * (l1_gemm_l.K + MAT_ROW_PADDING) + MAT_PADDING) *
-        l1_gemm_l.dtype;
-    uint32_t mat_B_size =
-        (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.N * l1_gemm_l.dtype;
-    uint32_t mat_C_size = l1_gemm_l.M * l1_gemm_l.N * l1_gemm_l.dtype;
-
-    uint32_t total_size = mat_A_size + mat_B_size + mat_C_size;
-
-    void *ptr;
-
-    if (compute_id == 0) {
-        ptr = snrt_l1alloc(total_size);
-        share_ptr = ptr;
-    }
-
-    snrt_cluster_hw_barrier();
-
-    ptr = share_ptr;
-
-    mat_A = ptr;
-    ptr += (l1_gemm_l.M * (l1_gemm_l.K + MAT_ROW_PADDING) + MAT_PADDING) *
-           l1_gemm_l.dtype;
-    mat_B = ptr;
-    ptr += (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.N * l1_gemm_l.dtype;
-    mat_C = ptr;
-    ptr += l1_gemm_l.M * l1_gemm_l.N * l1_gemm_l.dtype;
-
-    uint32_t errors = 0;
-
-    snrt_global_barrier();
-
-    if (snrt_is_dm_core()) {
-        snrt_dma_txid_t txid_A =
-            snrt_dma_start_2d(mat_A, l1_gemm_l.A, l1_gemm_l.dtype * l1_gemm_l.K,
-                              l1_gemm_l.dtype * (l1_gemm_l.K + MAT_ROW_PADDING),
-                              l1_gemm_l.dtype * l1_gemm_l.K, l1_gemm_l.M);
-        snrt_dma_txid_t txid_B =
-            snrt_dma_start_2d(mat_B, l1_gemm_l.B, l1_gemm_l.dtype * l1_gemm_l.K,
-                              l1_gemm_l.dtype * (l1_gemm_l.K + MAT_ROW_PADDING),
-                              l1_gemm_l.dtype * l1_gemm_l.K, l1_gemm_l.N);
-
-        snrt_dma_txid_t txid_C = snrt_dma_start_1d(
-            mat_C, l1_gemm_l.C, l1_gemm_l.dtype * l1_gemm_l.M * l1_gemm_l.N);
-
-        snrt_dma_wait_all();
-    }
-
-    snrt_cluster_hw_barrier();
-
-    if (snrt_is_compute_core()) {
-        const uint32_t setup_SSR = 1;
-
-        if (!l1_gemm_l.TA && !l1_gemm_l.TB) {
-            volatile uint32_t A_offset =
-                compute_id * (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.dtype;
-            volatile uint32_t C_offset =
-                compute_id * l1_gemm_l.N * l1_gemm_l.dtype;
-            volatile uint32_t ldA =
-                compute_num * (l1_gemm_l.K + MAT_ROW_PADDING);
-            volatile uint32_t ldB = l1_gemm_l.N + MAT_ROW_PADDING;
-            volatile uint32_t ldC = l1_gemm_l.N * compute_num;
-
-            snrt_mcycle();
-            gemm_fp64_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, l1_gemm_l.K,
-                          &mat_A[A_offset], ldA, l1_gemm_l.TA, mat_B, ldB,
-                          l1_gemm_l.TB, &mat_C[C_offset], ldC, &l1_gemm_l.ALPHA,
-                          setup_SSR);
-            snrt_mcycle();
-        } else if (!l1_gemm_l.TA && l1_gemm_l.TB) {
-            volatile uint32_t A_offset =
-                compute_id * (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.dtype;
-            volatile uint32_t C_offset =
-                compute_id * l1_gemm_l.N * l1_gemm_l.dtype;
-            volatile uint32_t ldA =
-                compute_num * (l1_gemm_l.K + MAT_ROW_PADDING);
-            volatile uint32_t ldB = l1_gemm_l.K + MAT_ROW_PADDING;
-            volatile uint32_t ldC = l1_gemm_l.N * compute_num;
-
-            snrt_mcycle();
-            switch (l1_gemm_l.dtype) {
-                case FP64:
-                    gemm_fp64_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N,
-                                  l1_gemm_l.K, &mat_A[A_offset], ldA,
-                                  l1_gemm_l.TA, mat_B, ldB, l1_gemm_l.TB,
-                                  &mat_C[C_offset], ldC, &l1_gemm_l.ALPHA,
-                                  setup_SSR);
-                    break;
-                case FP32:
-                    gemm_fp32_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N,
-                                  l1_gemm_l.K, &mat_A[A_offset], ldA, mat_B,
-                                  ldB, &mat_C[C_offset], ldC, &l1_gemm_l.ALPHA,
-                                  setup_SSR);
-                    break;
-                case FP16:
-                    if (l1_gemm_l.expand) {
-                        gemm_fp16_ex_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N,
-                                         l1_gemm_l.K, &mat_A[A_offset], ldA,
-                                         mat_B, ldB, &mat_C[C_offset], ldC,
-                                         &l1_gemm_l.ALPHA, setup_SSR);
-                    } else {
-                        gemm_fp16_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N,
-                                      l1_gemm_l.K, &mat_A[A_offset], ldA, mat_B,
-                                      ldB, &mat_C[C_offset], ldC,
-                                      &l1_gemm_l.ALPHA, setup_SSR);
-                    }
-                    break;
-                case FP8:
-                    gemm_fp8_ex_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N,
-                                    l1_gemm_l.K, &mat_A[A_offset], ldA, mat_B,
-                                    ldB, &mat_C[C_offset], ldC,
-                                    &l1_gemm_l.ALPHA, setup_SSR);
-                    break;
-            }
-            snrt_mcycle();
-        } else if (l1_gemm_l.TA) {
-            printf("transpose TA not supported\n");
-        }
-        snrt_cluster_hw_barrier();
-    } else {
-        snrt_cluster_hw_barrier();
-    }
-    snrt_cluster_hw_barrier();
-
-#ifdef BIST
-
-    if (compute_id == 0) {
-        if (l1_gemm_l.dtype == FP64) {
-            for (uint32_t m = 0; m < l1_gemm_l.M; m++) {
-                double check = checksum[m];
-                double sum = 0.0;
-                for (uint32_t n = 0; n < l1_gemm_l.N; n++) {
-                    sum += ((double *)mat_C)[m * l1_gemm_l.N + n];
-                }
-                if (fabs(sum - check) > 0.001) {
-                    errors += l1_gemm_l.N;
-                }
-            }
-        } else if (l1_gemm_l.dtype == FP32) {
-            for (uint32_t m = 0; m < l1_gemm_l.M; m++) {
-                float check = checksum[m];
-                float sum = 0.0;
-                for (uint32_t n = 0; n < l1_gemm_l.N; n++) {
-                    sum += ((float *)mat_C)[m * l1_gemm_l.N + n];
-                }
-                if (fabs(sum - check) > 0.001) {
-                    errors += l1_gemm_l.N;
-                }
-            }
-        } else if (l1_gemm_l.dtype == FP16) {
-            for (uint32_t m = 0; m < l1_gemm_l.M; m++) {
-                __fp16 check = checksum[m];
-                float sum = 0.0;
-                for (uint32_t n = 0; n < l1_gemm_l.N; n++) {
-                    sum += ((__fp16 *)mat_C)[m * l1_gemm_l.N + n];
-                }
-                if (fabs(sum - check) > 0.05) {
-                    errors += l1_gemm_l.N;
-                }
-            }
-        } else if (l1_gemm_l.dtype == FP8) {
-            printf("No golden model yet for fp8!\n");
-        }
-        printf("%d/%d Errors\n", errors, l1_gemm_l.M * l1_gemm_l.N);
-    }
-
-#endif
-
-    // TODO: change back!!!
-    return 0;
-}
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index 3d0bea4682..e0d6e6e0e5 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -200,7 +200,6 @@ typedef struct network_single_cluster_t_ {
 #include "../flashattention_2/src/flashattention_2.h"
 #include "../fused_concat_linear/src/fused_concat_linear.h"
 #include "../gelu/src/gelu.h"
-#include "../gemm/src/gemm.h"
 #include "../layernorm/src/layernorm.h"
 #include "../maxpool/src/maxpool.h"
 #include "../softmax/src/softmax.h"
diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile
index 3c4da406ab..a234b73906 100644
--- a/target/snitch_cluster/sw/apps/Makefile
+++ b/target/snitch_cluster/sw/apps/Makefile
@@ -13,7 +13,6 @@ SUBDIRS += dnn/batchnorm
 SUBDIRS += dnn/conv2d
 SUBDIRS += dnn/fusedconv
 SUBDIRS += dnn/gelu
-SUBDIRS += dnn/gemm
 SUBDIRS += dnn/layernorm
 SUBDIRS += dnn/maxpool
 SUBDIRS += dnn/softmax
diff --git a/target/snitch_cluster/sw/apps/dnn/gemm/Makefile b/target/snitch_cluster/sw/apps/dnn/gemm/Makefile
deleted file mode 100644
index 48a31215e5..0000000000
--- a/target/snitch_cluster/sw/apps/dnn/gemm/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright 2023 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Luca Colagrande
-
-APP ?= gemm
-
-include ../../../../../../sw/dnn/common.mk
-include ../../common.mk
-
-$(DEP): $(DATA_H)
\ No newline at end of file
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index cb0a386f5a..bf211effe0 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -75,7 +75,6 @@ runs:
     cmd: [../../../sw/blas/gemm/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/dnn/batchnorm/build/batchnorm.elf
   - elf: apps/dnn/maxpool/build/maxpool.elf
-  - elf: apps/dnn/gemm/build/gemm.elf
   # - elf: apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results
   #   cmd: [../../../sw/dnn/conv2d/verify.py, "${sim_bin}", "${elf}"]
   # - elf: apps/dnn/fusedconv/build/fusedconv.elf # Fails with wrong results