From 1eb0e45e7f9bbfa4016b5184968778787e99ed49 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 25 Oct 2023 17:58:07 +0200 Subject: [PATCH] Temporary DNN refactoring --- sw/blas/axpy/data/datagen.py | 13 +- sw/blas/gemm/data/datagen.py | 12 +- sw/dnn/{src => batchnorm}/batchnorm.h | 0 sw/dnn/common.mk | 31 +++++ sw/dnn/{src => conv2d}/conv2d.h | 0 sw/dnn/{src => gelu}/gelu.h | 0 sw/dnn/{src => gemm}/gemm.h | 0 sw/dnn/layernorm/.gitignore | 1 + sw/dnn/layernorm/data/datagen.py | 99 +++++++++++++++ .../dnn/layernorm/data}/params.hjson | 5 +- sw/dnn/{ => layernorm}/src/layernorm.h | 85 ++++--------- sw/dnn/layernorm/src/main.c | 116 ++++++++++++++++++ sw/dnn/layernorm/verify.py | 0 sw/dnn/{src => linear}/linear.h | 0 sw/dnn/{src => maxpool}/maxpool.h | 0 sw/dnn/{src => softmax}/softmax.h | 0 sw/dnn/src/dnn.h | 20 +-- target/snitch_cluster/sw/apps/Makefile | 18 +-- target/snitch_cluster/sw/apps/common.mk | 1 + .../sw/apps/dnn/layernorm/Makefile | 6 +- .../sw/apps/dnn/layernorm/src/layernorm.c | 29 ----- util/sim/data_utils.py | 60 +++++++-- 22 files changed, 355 insertions(+), 141 deletions(-) rename sw/dnn/{src => batchnorm}/batchnorm.h (100%) create mode 100644 sw/dnn/common.mk rename sw/dnn/{src => conv2d}/conv2d.h (100%) rename sw/dnn/{src => gelu}/gelu.h (100%) rename sw/dnn/{src => gemm}/gemm.h (100%) create mode 100644 sw/dnn/layernorm/.gitignore create mode 100755 sw/dnn/layernorm/data/datagen.py rename {target/snitch_cluster/sw/apps/dnn/layernorm/src => sw/dnn/layernorm/data}/params.hjson (64%) rename sw/dnn/{ => layernorm}/src/layernorm.h (71%) create mode 100644 sw/dnn/layernorm/src/main.c create mode 100644 sw/dnn/layernorm/verify.py rename sw/dnn/{src => linear}/linear.h (100%) rename sw/dnn/{src => maxpool}/maxpool.h (100%) rename sw/dnn/{src => softmax}/softmax.h (100%) delete mode 100644 target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c diff --git a/sw/blas/axpy/data/datagen.py b/sw/blas/axpy/data/datagen.py index f7ae7a6488..fc25cd40af 100755 --- a/sw/blas/axpy/data/datagen.py +++ b/sw/blas/axpy/data/datagen.py @@ -11,8 +11,8 @@ import os sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) -from data_utils import format_scalar_definition, format_vector_definition, \ - format_vector_declaration, format_ifdef_wrapper # noqa: E402 +from data_utils import format_scalar_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 MIN = -1000 MAX = +1000 @@ -47,16 +47,15 @@ def main(): a = np.random.uniform(MIN, MAX, 1) x = np.random.uniform(MIN, MAX, length) y = np.random.uniform(MIN, MAX, length) - z = np.zeros(length) g = golden_model(a, x, y) # Format header file l_str = format_scalar_definition('const uint32_t', 'l', length) a_str = format_scalar_definition('const double', 'a', a[0]) - x_str = format_vector_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section) - y_str = format_vector_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section) - z_str = format_vector_declaration('double', 'z', z, alignment=BURST_ALIGNMENT, section=section) - g_str = format_vector_definition('double', 'g', g) + x_str = format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section) + y_str = format_array_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section) + z_str = format_array_declaration('double', 'z', [length], alignment=BURST_ALIGNMENT, section=section) + g_str = format_array_definition('double', 'g', g) g_str = 
format_ifdef_wrapper('BIST', g_str) f_str = '\n\n'.join([l_str, a_str, x_str, y_str, z_str, g_str]) f_str += '\n' diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py index 0ccab83817..a3e25539fd 100755 --- a/sw/blas/gemm/data/datagen.py +++ b/sw/blas/gemm/data/datagen.py @@ -15,7 +15,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) from data_utils import emit_license, format_scalar_definition, \ - format_vector_definition, format_ifdef_wrapper # noqa: E402 + format_array_definition, format_ifdef_wrapper # noqa: E402 np.random.seed(42) @@ -100,16 +100,16 @@ def emit_header(**kwargs): data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])] data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)] data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] if kwargs['prec'] == 8: - result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten()) + result_def = format_array_definition(C_TYPES['64'], 'result', result.flatten()) else: - result_def = format_vector_definition(C_TYPES[str(kwargs['prec'])], + result_def = format_array_definition(C_TYPES[str(kwargs['prec'])], 'result', result.flatten()) data_str += [format_ifdef_wrapper('BIST', result_def)] diff --git a/sw/dnn/src/batchnorm.h b/sw/dnn/batchnorm/batchnorm.h similarity index 100% rename from sw/dnn/src/batchnorm.h rename to sw/dnn/batchnorm/batchnorm.h diff --git a/sw/dnn/common.mk b/sw/dnn/common.mk new file mode 100644 index 0000000000..0d933719de --- /dev/null +++ b/sw/dnn/common.mk @@ -0,0 +1,31 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +# Usage of absolute paths is required to externally include this Makefile +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) + +DATA_DIR := $(realpath $(MK_DIR)/$(APP)/data) +SRC_DIR := $(realpath $(MK_DIR)/$(APP)/src) +COMMON_SRC_DIR := $(realpath $(MK_DIR)/src) + +DATA_CFG ?= $(DATA_DIR)/params.hjson +SECTION ?= + +SRCS ?= $(realpath $(SRC_DIR)/main.c) +INCDIRS ?= $(DATA_DIR) $(SRC_DIR) $(COMMON_SRC_DIR) + +DATAGEN_PY := $(DATA_DIR)/datagen.py +DATA_H := $(DATA_DIR)/data.h + +$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) + $< -c $(DATA_CFG) --section="$(SECTION)" > $@ + +.PHONY: clean-data clean + +clean-data: + rm -f $(DATA_H) + +clean: clean-data diff --git a/sw/dnn/src/conv2d.h b/sw/dnn/conv2d/conv2d.h similarity index 100% rename from sw/dnn/src/conv2d.h rename to sw/dnn/conv2d/conv2d.h diff --git a/sw/dnn/src/gelu.h b/sw/dnn/gelu/gelu.h similarity index 100% rename from sw/dnn/src/gelu.h rename to sw/dnn/gelu/gelu.h diff --git a/sw/dnn/src/gemm.h b/sw/dnn/gemm/gemm.h similarity index 100% rename from sw/dnn/src/gemm.h rename to sw/dnn/gemm/gemm.h diff --git a/sw/dnn/layernorm/.gitignore b/sw/dnn/layernorm/.gitignore new file mode 100644 index 0000000000..f5ac16baa2 --- /dev/null +++ b/sw/dnn/layernorm/.gitignore @@ -0,0 +1 @@ +data/data.h diff --git a/sw/dnn/layernorm/data/datagen.py b/sw/dnn/layernorm/data/datagen.py new file mode 100755 index 0000000000..2be1c989a1 --- /dev/null +++ b/sw/dnn/layernorm/data/datagen.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import argparse +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +from data_utils import emit_license, format_scalar_definition, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 +import data_utils + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. 
To minimize
+# the occurrence of these splits, the data should be aligned to 4KB.
+BURST_ALIGNMENT = 4096
+
+
+def golden_model(ifmap, eps, shape, prec):
+    dtype = data_utils.floating_point_torch_type(prec)
+    ln = torch.nn.LayerNorm(shape, eps=eps, dtype=dtype)
+    return ln(ifmap)
+
+
+def emit_header(**kwargs):
+    batch_size = kwargs['input_dim']['batch_size']
+    seq_len = kwargs['input_dim']['seq_len']
+    embeddings = kwargs['input_dim']['embeddings']
+    eps = kwargs['eps']
+    prec = str(kwargs['prec'])
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ifmap = torch.randn(batch_size, seq_len, embeddings, requires_grad=False, dtype=torch_type)
+
+    ofmap = golden_model(ifmap, eps, embeddings, prec)
+    ofmap = ofmap.detach().numpy()
+
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap_uid = 'ifmap'
+    ofmap_uid = 'ofmap'
+
+    layer_cfg = {
+        **kwargs['input_dim'],
+        'eps': eps,
+        'ifmap': ifmap_uid,
+        'ofmap': ofmap_uid
+    }
+
+    data_str = [emit_license()]
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape,
+                                          alignment=BURST_ALIGNMENT)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape,
+                                          alignment=BURST_ALIGNMENT)]
+    data_str += [format_struct_definition('layernorm_layer_t', 'layer', layer_cfg)]
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap,
+                                         alignment=BURST_ALIGNMENT)]
+    result_def = format_array_definition(ctype, 'golden', ofmap, alignment=BURST_ALIGNMENT)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for layernorm kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file for the kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    print(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson b/sw/dnn/layernorm/data/params.hjson
similarity index 64%
rename from target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson
rename to sw/dnn/layernorm/data/params.hjson
index a9e3fca54a..a12036b254 100644
--- a/target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson
+++ b/sw/dnn/layernorm/data/params.hjson
@@ -1,11 +1,8 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
+// Copyright 2023 ETH Zurich and University of Bologna.
 // Solderpad Hardware License, Version 0.51, see LICENSE for details. 
 // SPDX-License-Identifier: SHL-0.51
 
-// Parameters for a single LayerNorm layer
-
 {
-    kernel: "LayerNorm"
     input_dim: {
         batch_size: 1,
         seq_len: 32,
diff --git a/sw/dnn/src/layernorm.h b/sw/dnn/layernorm/src/layernorm.h
similarity index 71%
rename from sw/dnn/src/layernorm.h
rename to sw/dnn/layernorm/src/layernorm.h
index b875303146..3ce0c0c27b 100644
--- a/sw/dnn/src/layernorm.h
+++ b/sw/dnn/layernorm/src/layernorm.h
@@ -7,7 +7,7 @@
 #include "math.h"
 #include "snrt.h"
 // #include "printf.h"
-#include "utils.h"
+#include "dnn.h"
 
 // add dump function for layernorm
 dump_float(ln, 5);
@@ -16,29 +16,24 @@ dump_float(ln, 5);
  * @struct layernorm_layer_struct
  * @brief This structure contains all parameters necessary
  * for computing the LayerNorm activation function
- * @var layernorm_layer_struct::BATCH_SIZE
+ * @var layernorm_layer_struct::batch_size
  * Size of each input sample
- * @var layernorm_layer_struct::SEQ_LEN
+ * @var layernorm_layer_struct::seq_len
  * Size of each output sample
- * @var layernorm_layer_struct::EMBEDDINGS
+ * @var layernorm_layer_struct::embeddings
  * Number of hidden dimensions
  * @var layernorm_layer_struct::ifmap
  * Pointer to input feature map
  * @var layernorm_layer_struct::ofmap
  * Pointer to output feature map
- * @var layernorm_layer_struct::result
- * Pointer to the golden model output
  */
 typedef struct layernorm_layer_struct {
-    uint32_t BATCH_SIZE;
-    uint32_t SEQ_LEN;
-    uint32_t EMBEDDINGS;
-    uint32_t EPS;
-
-    float *ifmap;
-    float *ofmap;
-    float *result;
-
+    uint32_t batch_size;
+    uint32_t seq_len;
+    uint32_t embeddings;
+    float eps;
+    void *ifmap;
+    void *ofmap;
     precision_t dtype;
 } layernorm_layer_t;
@@ -95,44 +90,8 @@ static inline void layernorm_fp32(float *input, float *output, int32_t ldI,
  */
 static inline void transformer_layernorm_fp64(double *input, int32_t ldI,
                                               int32_t seq_len, int32_t embeddings,
-                                              int32_t eps) {
-    double mean = 0.0;  // max value of the current core
-    double var = 0.0;   // sum of the exp values of the current core
-
-    uint32_t compute_id = snrt_global_core_idx();
-    uint32_t num_cores = snrt_cluster_compute_core_num();
-
-    for (int32_t s = 0; s < seq_len; s++) {
-        mean = 0.0;
-        var = 0.0;
-
-        for (int32_t i = 0; i < embeddings; i++) {
-            mean += input[s * ldI + i];
-        }
-        mean /= embeddings;
-
-        // printf("mean[%d] = %f\n", b, mean);
-
-        for (int32_t i = 0; i < embeddings; i++) {
-            var += (input[s * ldI + i] - mean) *
-                   (input[s * ldI + i] - mean);
-        }
-        var /= embeddings;
-
-        // printf("var[%d] = %f\n", b, var);
-
-        // compute the shifted value of the current row
-        for (int32_t i = 0; i < embeddings; i++) {
-            input[s * ldI + i] =
-                (input[s * ldI + i] - mean) /
-                sqrtf(var + eps);
-            // printf("output[%d][%d][%d] = %f\n", b, s + compute_id, i,
-            //        output[s * ldI + i]);
-            // dump_ln(input[s * ldI + i]);
-        }
-    }
-
-    snrt_cluster_hw_barrier();
+                                              int32_t eps) {
+    layernorm_fp64(input, input, ldI, 0, 1, seq_len, embeddings, eps);
 }
@@ -187,14 +146,14 @@ static inline void transformer_layernorm_fp32(float *input, int32_t ldI,
  * @param l layernorm_layer struct that holds addresses and parameters
  *
  */
-static inline void layernorm_layer(const layernorm_layer_t *l) {
+static inline void layernorm_layer(layernorm_layer_t l) {
     uint32_t cluster_num = snrt_cluster_num();
     uint32_t cluster_id = snrt_cluster_idx();
     uint32_t compute_num = snrt_cluster_compute_core_num();
     uint32_t compute_id = snrt_global_core_idx();
 
     uint32_t ifmap_size =
-        l->BATCH_SIZE * l->SEQ_LEN * l->EMBEDDINGS * sizeof(float);
+        l.batch_size * l.seq_len * l.embeddings * 
sizeof(float); uint32_t ofmap_size = ifmap_size; void *ptr = (float *)snrt_l1_next(); @@ -206,9 +165,9 @@ static inline void layernorm_layer(const layernorm_layer_t *l) { // DMA transfer the ifmap into the cluster TCDM if (snrt_is_dm_core()) { snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( - ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float), - l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float), - l->SEQ_LEN * l->EMBEDDINGS * sizeof(float)); + ifmap, l.ifmap, l.batch_size * sizeof(float), + l.batch_size * sizeof(float), l.batch_size * sizeof(float), + l.seq_len * l.embeddings * sizeof(float)); snrt_dma_wait_all(); } @@ -217,18 +176,18 @@ static inline void layernorm_layer(const layernorm_layer_t *l) { if (snrt_is_compute_core()) { // determine the row offset for each core - int32_t row_offset = compute_id * l->EMBEDDINGS; + int32_t row_offset = compute_id * l.embeddings; // determine the row stride of each matrix - int32_t ldI = compute_num * l->EMBEDDINGS; + int32_t ldI = compute_num * l.embeddings; // determine the batch offset for each core - int32_t batch_offset = l->SEQ_LEN * l->EMBEDDINGS; + int32_t batch_offset = l.seq_len * l.embeddings; // printf("row_offset: %d, ldI: %d\n", row_offset, ldI); layernorm_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, - batch_offset, l->BATCH_SIZE, l->SEQ_LEN / 8, - l->EMBEDDINGS, l->EPS); + batch_offset, l.batch_size, l.seq_len / 8, + l.embeddings, l.eps); } else { snrt_cluster_hw_barrier(); diff --git a/sw/dnn/layernorm/src/main.c b/sw/dnn/layernorm/src/main.c new file mode 100644 index 0000000000..badc8aefdb --- /dev/null +++ b/sw/dnn/layernorm/src/main.c @@ -0,0 +1,116 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande
+
+// #include "snrt.h"
+#include "layernorm.h"
+
+#include "data.h"
+
+int main() {
+    layernorm_layer(layer);
+
+
+// void *local_a, *local_b, *local_c;
+// void *remote_a, *remote_b, *remote_c;
+
+// // Calculate size and pointers for each cluster
+// uint32_t frac_m = M / snrt_cluster_num();
+// uint32_t frac_a = frac_m * K;
+// uint32_t frac_c = frac_m * N;
+// uint32_t size_frac_a = frac_a * dtype_size;
+// uint32_t size_b = K * N * dtype_size;
+// uint32_t size_frac_c = frac_c * dtype_size;
+// uint32_t offset_a = frac_a * snrt_cluster_idx();
+// uint32_t offset_c = frac_c * snrt_cluster_idx();
+// remote_a = a + offset_a;
+// remote_b = b;
+// remote_c = c + offset_c;
+
+// // Allocate space in TCDM
+// local_a = (void *)snrt_l1_next();
+// local_b = local_a + size_frac_a;
+// local_c = local_b + size_b;
+
+// // Copy data in TCDM
+// if (snrt_is_dm_core()) {
+// snrt_dma_start_1d(local_a, remote_a, size_frac_a);
+// snrt_dma_start_1d(local_b, remote_b, size_b);
+// snrt_dma_start_1d(local_c, remote_c, size_frac_c);
+// snrt_dma_wait_all();
+// }
+
+// snrt_cluster_hw_barrier();
+
+// // Compute
+// if (!snrt_is_dm_core()) {
+// const uint32_t setup_ssr = 1;
+// uint32_t start_cycle = snrt_mcycle();
+
+// volatile uint32_t lda = K;
+// volatile uint32_t ldb = N;
+// volatile uint32_t ldc = N;
+
+// // Transpose of A unsupported
+// if (TA) return -1;
+// if (TB) {
+// // Transpose of B supported only in FP64
+// if (dtype_size != FP64) return -1;
+// ldb = K;
+// }
+
+// gemm(dtype_size, expand, setup_ssr, TA, TB, frac_m, N, K, 1, local_a,
+// lda, local_b, ldb, BETA, local_c, ldc);
+
+// uint32_t end_cycle = snrt_mcycle();
+// }
+
+// snrt_cluster_hw_barrier();
+
+// // Copy data out of TCDM
+// if (snrt_is_dm_core()) {
+// snrt_dma_start_1d(remote_c, local_c, size_frac_c);
+// snrt_dma_wait_all();
+// }
+
+// // TODO: currently only works for single cluster, otherwise we need to
+// // synchronize all cores here
+
+// #ifdef BIST
+// uint32_t errors = M * N;
+
+// if (snrt_cluster_core_idx() == 0) {
+// for (uint32_t m = 0; m < M; m++) {
+// for (uint32_t n = 0; n < N; n++) {
+// uint32_t idx = m * N + n;
+// switch (dtype_size) {
+// case FP64:
+// if (fabs(result[idx] - ((double *)local_c)[idx]) >
+// 0.001)
+// errors--;
+// break;
+// case FP32:
+// if (fabs(result[idx] - ((float *)local_c)[idx]) > 0.001)
+// errors--;
+// break;
+// case FP16:
+// if (fabs(result[idx] - ((__fp16 *)local_c)[idx]) >
+// 0.001)
+// errors--;
+// break;
+// case FP8:
+// printf("No golden model yet for fp8!\n");
+// return -1;
+// break;
+// }
+// }
+// }
+// printf("%d/%d Errors\n", errors, M * N);
+// }
+
+// return errors;
+// #endif
+
+// return 0;
+}
diff --git a/sw/dnn/layernorm/verify.py b/sw/dnn/layernorm/verify.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/sw/dnn/src/linear.h b/sw/dnn/linear/linear.h
similarity index 100%
rename from sw/dnn/src/linear.h
rename to sw/dnn/linear/linear.h
diff --git a/sw/dnn/src/maxpool.h b/sw/dnn/maxpool/maxpool.h
similarity index 100%
rename from sw/dnn/src/maxpool.h
rename to sw/dnn/maxpool/maxpool.h
diff --git a/sw/dnn/src/softmax.h b/sw/dnn/softmax/softmax.h
similarity index 100%
rename from sw/dnn/src/softmax.h
rename to sw/dnn/softmax/softmax.h
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index 537f488cd9..e228de21d1 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -197,13 +197,13 @@ typedef struct network_single_cluster_t_ {
 
 // Must be included before batchnorm 
since the batchnorm layer // uses the conv_layer struct. This is bad design. // TODO Fix this, union types should be preferred -#include "conv2d.h" - -#include "batchnorm.h" -#include "gelu.h" -#include "gemm.h" -#include "layernorm.h" -#include "linear.h" -#include "maxpool.h" -#include "softmax.h" -#include "utils.h" +// #include "conv2d.h" + +// #include "batchnorm.h" +// #include "gelu.h" +// #include "gemm.h" +// #include "layernorm.h" +// #include "linear.h" +// #include "maxpool.h" +// #include "softmax.h" +// #include "utils.h" diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile index 0410fb1cb4..219887ee6a 100644 --- a/target/snitch_cluster/sw/apps/Makefile +++ b/target/snitch_cluster/sw/apps/Makefile @@ -6,18 +6,18 @@ SUBDIRS = lto SUBDIRS += nop -SUBDIRS += transformer +# SUBDIRS += transformer SUBDIRS += blas/axpy SUBDIRS += blas/gemm -SUBDIRS += dnn/batchnorm -SUBDIRS += dnn/conv2d -SUBDIRS += dnn/fusedconv -SUBDIRS += dnn/gelu -SUBDIRS += dnn/gemm +# SUBDIRS += dnn/batchnorm +# SUBDIRS += dnn/conv2d +# SUBDIRS += dnn/fusedconv +# SUBDIRS += dnn/gelu +# SUBDIRS += dnn/gemm SUBDIRS += dnn/layernorm -SUBDIRS += dnn/linear -SUBDIRS += dnn/maxpool -SUBDIRS += dnn/softmax +# SUBDIRS += dnn/linear +# SUBDIRS += dnn/maxpool +# SUBDIRS += dnn/softmax SUBDIRS += montecarlo/pi_estimation .PHONY: all clean $(SUBDIRS) diff --git a/target/snitch_cluster/sw/apps/common.mk b/target/snitch_cluster/sw/apps/common.mk index 8e1950860e..94eb35236a 100644 --- a/target/snitch_cluster/sw/apps/common.mk +++ b/target/snitch_cluster/sw/apps/common.mk @@ -37,6 +37,7 @@ INCDIRS += $(SNRT_DIR)/api INCDIRS += $(SNRT_DIR)/api/omp INCDIRS += $(SNRT_DIR)/src INCDIRS += $(SNRT_DIR)/src/omp +INCDIRS += $(ROOT)/sw/blas INCDIRS += $(ROOT)/sw/deps/riscv-opcodes INCDIRS += $(ROOT)/sw/math/include diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile b/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile index 87fa026c70..f8df5a08ac 100644 --- a/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = layernorm +APP ?= layernorm -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk $(DEP): $(DATA_H) diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c b/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c deleted file mode 100644 index fa776940f6..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0
-
-// SW testbench for profiling linear kernels in different
-// floating point precisions (fp64, fp32, fp16), as well as
-// different memory layouts for matrices (transposed/not-transposed)
-// Correctness of results are checked automatically
-
-#include "dnn.h"
-#include "snrt.h"
-
-#include "data.h"
-
-int main() {
-    layernorm_l.ifmap = (float*)layernorm_ifmap_dram;
-    layernorm_l.result = (float*)layernorm_ofmap_dram;
-
-    // checksum = (float*)layernorm_checksum;
-
-    // printf("Starting layernorm layer\n");
-
-    layernorm_layer(&layernorm_l);
-
-    // uint32_t error = check_layernorm_layer(&linear_l,
-    //                                        (float*)linear_checksum);
-
-    return 0;
-}
\ No newline at end of file
diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py
index 2ed260d3f1..609cc708f8 100644
--- a/util/sim/data_utils.py
+++ b/util/sim/data_utils.py
@@ -6,15 +6,38 @@
 
 import struct
 from datetime import datetime
+import torch
+import numpy as np
 
 
 def emit_license():
     s = (f"// Copyright {datetime.now().year} ETH Zurich and University of Bologna.\n"
          f"// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n"
-         f"// SPDX-License-Identifier: Apache-2.0\n\n")
+         f"// SPDX-License-Identifier: Apache-2.0\n")
     return s
 
 
+# Returns the torch type representing a floating-point value of the specified precision
+def floating_point_torch_type(precision):
+    prec_to_torch_type_map = {
+        '64': torch.float64,
+        '32': torch.float32,
+        '16': torch.float16,
+        '8': None
+    }
+    return prec_to_torch_type_map[precision]
+
+
+# Returns the C type representing a floating-point value of the specified precision
+def floating_point_ctype(precision):
+    prec_to_fp_type_map = {
+        '64': 'double',
+        '32': 'float',
+        '16': '__fp16',
+        '8': 'char'
+    }
+    return prec_to_fp_type_map[precision]
+
+
 def variable_attributes(alignment=None, section=None):
     attributes = ''
     if alignment:
@@ -24,11 +47,27 @@ def variable_attributes(alignment=None, section=None):
     return attributes
 
 
-def format_vector_definition(type, uid, vector, alignment=None, section=None):
+def format_array_declaration(dtype, uid, shape, alignment=None, section=None):
     attributes = variable_attributes(alignment, section)
-    s = f'{type} {uid}[{len(vector)}] {attributes} = ' + '{\n'
-    for el in vector:
-        if type != 'char':
+    s = f'{dtype} {uid}'
+    for dim in shape:
+        s += f'[{dim}]'
+    s += f' {attributes};'
+    return s
+
+
+def format_array_definition(dtype, uid, array, alignment=None, section=None):
+    # Definition starts with the declaration stripped of the terminating semicolon
+    s = format_array_declaration(dtype, uid, array.shape, alignment, section)[:-1]
+    s += ' = {\n'
+    # Flatten array
+    if isinstance(array, np.ndarray):
+        array = array.flat
+    if isinstance(array, torch.Tensor):
+        array = array.numpy().flat
+    # Format array elements
+    for el in array:
+        if dtype != 'char':
             el_str = f'{el}'
         else:
             el_str = f'0x{el:02x}'
@@ -37,14 +76,15 @@
     return s
 
 
-def format_vector_declaration(type, uid, vector, alignment=None, section=None):
-    attributes = variable_attributes(alignment, section)
-    s = f'{type} {uid}[{len(vector)}] {attributes};'
+def format_scalar_definition(dtype, uid, scalar):
+    s = f'{dtype} {uid} = {scalar};'
     return s
 
 
-def format_scalar_definition(type, uid, scalar):
-    s = f'{type} {uid} = {scalar};'
+def format_struct_definition(dtype, uid, struct_map):
+    s = f'{dtype} {uid} = {{\n'
+    s += ',\n'.join([f'\t.{key} = {value}' for (key, value) in struct_map.items()])
+    s += '\n};'
    return s
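
---

For reference, a minimal sketch of how the reworked data_utils helpers compose
in a downstream datagen script. The "foo" kernel, its struct type and field
names are hypothetical; only the helper signatures introduced by this patch are
taken as given, and util/sim is assumed to be on the Python path.

#!/usr/bin/env python3
# Sketch only: exercises the new data_utils API for a hypothetical "foo" kernel.
import os
import sys
import numpy as np

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import emit_license, format_array_declaration, \
    format_array_definition, format_struct_definition, \
    format_ifdef_wrapper  # noqa: E402

ifmap = np.random.rand(4, 16)

# Declarations take an explicit shape; definitions derive it from the array itself.
decl = format_array_declaration('double', 'ifmap', ifmap.shape, alignment=4096)
defn = format_array_definition('double', 'ifmap', ifmap, alignment=4096)
# Struct fields are emitted as C99 designated initializers.
layer = format_struct_definition('foo_layer_t', 'layer',
                                 {'batch_size': 1, 'ifmap': 'ifmap'})
# Golden output is only compiled into built-in self-test (BIST) binaries.
golden = format_ifdef_wrapper('BIST', format_array_definition('double', 'g', ifmap))

print('\n\n'.join([emit_license(), decl, defn, layer, golden]))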
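sw/dnn/layernorm/verify.py is added empty in this commit. One possible shape
for it, purely as a sketch: regenerate the golden model with the same seed as
datagen.py and compare it against the ofmap written back by the simulation.
The --dump interface, the raw-FP64 dump format and the tolerance below are
assumptions, not part of this patch.

#!/usr/bin/env python3
# Hypothetical verify.py sketch (FP64 only); the dump format is an assumption.
import argparse
import pathlib
import sys
import hjson
import numpy as np
import torch


def main():
    parser = argparse.ArgumentParser(description='Verify layernorm results')
    parser.add_argument('-c', '--cfg', type=pathlib.Path, required=True,
                        help='Param config file used for data generation')
    parser.add_argument('--dump', type=pathlib.Path, required=True,
                        help='Raw binary ofmap dumped by the simulation')
    args = parser.parse_args()

    param = hjson.loads(args.cfg.read_text())
    dim = param['input_dim']

    # Same seed as datagen.py, so the same ifmap is regenerated
    torch.manual_seed(42)
    ifmap = torch.randn(dim['batch_size'], dim['seq_len'], dim['embeddings'],
                        dtype=torch.float64)
    ln = torch.nn.LayerNorm(dim['embeddings'], eps=param['eps'],
                            dtype=torch.float64)
    golden = ln(ifmap).detach().numpy()

    actual = np.fromfile(args.dump, dtype=np.float64).reshape(golden.shape)
    errors = int(np.sum(~np.isclose(actual, golden, atol=1e-6)))
    print(f'{errors} mismatches')
    return int(errors != 0)


if __name__ == '__main__':
    sys.exit(main())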