sw: Add transpose layer for fp32 and fp64
fischeti committed Nov 7, 2023
1 parent ca4d1e3 commit e990aac
Showing 8 changed files with 351 additions and 0 deletions.
1 change: 1 addition & 0 deletions sw/dnn/src/dnn.h
@@ -204,4 +204,5 @@ typedef struct network_single_cluster_t_ {
#include "../linear/src/linear.h"
#include "../maxpool/src/maxpool.h"
#include "../softmax/src/softmax.h"
#include "../transpose/src/transpose.h"
// #include "utils.h"
109 changes: 109 additions & 0 deletions sw/dnn/transpose/data/datagen.py
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Tim Fischer <[email protected]>
# Viviane Potocnik <[email protected]>
# Luca Colagrande <[email protected]>

import argparse
import pathlib
import hjson
import sys
import os
import torch

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
import data_utils # noqa: E402
from data_utils import emit_license, \
format_struct_definition, format_array_definition, \
format_array_declaration, format_ifdef_wrapper # noqa: E402

torch.manual_seed(42)

# AXI splits bursts crossing 4KB address boundaries. To minimize
# the occurrence of these splits the data should be aligned to 4KB
BURST_ALIGNMENT = 4096
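# For example, a 256-byte burst starting at address 0x0F80 would cross
# the 0x1000 boundary and be split into two transactions; a 4 KiB-aligned
# base avoids such mid-burst crossings (illustrative figures only, not
# taken from the commit).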

PRECISION_T = {
'64': 'FP64',
'32': 'FP32',
'16': 'FP16',
'8': 'FP8'
}


def golden_model(input):
return input.t()


def emit_header(**kwargs):
M = kwargs['input_dim']['M']
N = kwargs['input_dim']['N']
prec = str(kwargs['prec'])

torch_type = data_utils.floating_point_torch_type(prec)
input = torch.randn(M, N, requires_grad=False, dtype=torch_type)

output = golden_model(input)
output = output.detach().numpy()

ctype = data_utils.floating_point_ctype(prec)

input_uid = 'input'
output_uid = 'output'

layer_cfg = {
**kwargs['input_dim'],
'input': input_uid,
'output': output_uid,
'dtype': PRECISION_T[prec]
}

data_str = [emit_license()]
data_str += [format_array_declaration(ctype, input_uid, input.shape,
alignment=BURST_ALIGNMENT)]
data_str += [format_array_declaration(ctype, output_uid, output.shape,
alignment=BURST_ALIGNMENT)]
data_str += [format_struct_definition('transpose2d_layer_t', 'layer', layer_cfg)]
data_str += [format_array_definition(ctype, input_uid, input,
alignment=BURST_ALIGNMENT)]
result_def = format_array_definition(ctype, 'golden', output, alignment=BURST_ALIGNMENT)
data_str += [format_ifdef_wrapper('BIST', result_def)]
data_str = '\n\n'.join(data_str)

return data_str


def main():

parser = argparse.ArgumentParser(description='Generate data for transpose kernel')
parser.add_argument(
"-c", "--cfg",
type=pathlib.Path,
required=True,
help='Select param config file for the kernel'
)
parser.add_argument(
'--section',
type=str,
help='Section to store matrices in')
parser.add_argument(
'output',
type=pathlib.Path,
help='Path of the output header file')
args = parser.parse_args()

# Load param config file
with args.cfg.open() as f:
param = hjson.loads(f.read())
param['section'] = args.section

# Emit header file
with open(args.output, 'w') as f:
f.write(emit_header(**param))


if __name__ == '__main__':
main()
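
For reference, the golden model above is a plain 2-D transpose. A minimal standalone sketch of its behavior, using a hypothetical 4x3 input that is not part of the commit:

import torch

# Element [m, n] of the input ends up at [n, m] of the output.
x = torch.randn(4, 3, dtype=torch.float64)
y = x.t()
assert y.shape == (3, 4)
assert all(torch.equal(y[n, m], x[m, n])
           for m in range(4) for n in range(3))
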
11 changes: 11 additions & 0 deletions sw/dnn/transpose/data/params.hjson
@@ -0,0 +1,11 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51

{
input_dim: {
M: 64,
N: 64,
}
prec: 64
}
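
datagen.py reads this file with hjson and forwards the resulting dictionary to emit_header as keyword arguments. A short sketch of that round trip, assuming the file sits at data/params.hjson relative to the working directory:

import hjson

with open('data/params.hjson') as f:
    param = hjson.loads(f.read())
assert param['input_dim'] == {'M': 64, 'N': 64}
assert param['prec'] == 64  # maps to 'FP64' via PRECISION_T in datagen.py
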
14 changes: 14 additions & 0 deletions sw/dnn/transpose/src/main.c
@@ -0,0 +1,14 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Luca Colagrande <[email protected]>

#include "dnn.h"

#include "data.h"

int main() {
transpose2d_layer(layer);
return 0;
}
122 changes: 122 additions & 0 deletions sw/dnn/transpose/src/transpose.h
@@ -0,0 +1,122 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "math.h"
#include "snrt.h"

/**
* @struct transpose2d_layer_struct
* @brief This structure contains all parameters necessary
* for computing the Transpose2D of a matrix
 * @var transpose2d_layer_struct::M
 * First dimension of the matrix
 * @var transpose2d_layer_struct::N
 * Second dimension of the matrix
 * @var transpose2d_layer_struct::input
 * Pointer to input feature map
 * @var transpose2d_layer_struct::output
 * Pointer to output feature map
 * @var transpose2d_layer_struct::dtype
 * Precision of the layer data
 */
typedef struct transpose2d_layer_struct {
uint32_t M;
uint32_t N;
void *input;
void *output;
precision_t dtype;
} transpose2d_layer_t;

/**
* @brief Implementation of the FP64 Transpose2D kernel
*
* @param input Pointer to input feature map
* @param output Pointer to output feature map
* @param M First dimension of the matrix
 * @param N Second dimension of the matrix
 * @param M_stride Row stride of the output matrix (in elements)
 */
static inline void transposed2d_fp64(double* input, double* output, uint32_t M, uint32_t N, uint32_t M_stride) {
for (uint32_t m = 0; m < M; m++) {
for (uint32_t n = 0; n < N; n++) {
output[n * M_stride + m] = input[m * N + n];
}
}
}

/**
* @brief Implementation of the FP32 Transpose2D kernel
*
* @param input Pointer to input feature map
* @param output Pointer to output feature map
* @param M First dimension of the matrix
 * @param N Second dimension of the matrix
 * @param M_stride Row stride of the output matrix (in elements)
 */
static inline void transposed2d_fp32(float* input, float* output, uint32_t M, uint32_t N, uint32_t M_stride) {
for (uint32_t m = 0; m < M; m++) {
for (uint32_t n = 0; n < N; n++) {
output[n * M_stride + m] = input[m * N + n];
}
}
}

/**
* @brief Transpose2D layer
*
* @param l transpose2D struct that holds addresses and parameters
*
*/
static inline void transpose2d_layer(transpose2d_layer_t const l) {
uint32_t cluster_num = snrt_cluster_num();
uint32_t cluster_id = snrt_cluster_idx();
uint32_t compute_num = snrt_cluster_compute_core_num();
uint32_t compute_id = snrt_global_core_idx();

uint32_t matrix_size = l.M * l.N;

void *ptr = snrt_l1_next();
void *input = ptr;
ptr += matrix_size * l.dtype;
void *output = ptr;
ptr += matrix_size * l.dtype;

// DMA transfer the matrix into the cluster TCDM
if (snrt_is_dm_core()) {
snrt_dma_start_1d(input, l.input, matrix_size * l.dtype);
snrt_dma_wait_all();
}

snrt_cluster_hw_barrier();

if (snrt_is_compute_core()) {
// determine the row offset for each core
int32_t row_offset = compute_id * (l.M / compute_num);

// calculate the input address offset
void* input_offset = input + row_offset * l.N * l.dtype;

// calculate the output address offset
void* output_offset = output + row_offset * l.dtype;

switch(l.dtype) {
case FP32:
transposed2d_fp32(input_offset, output_offset, l.M / compute_num, l.N, l.M);
break;
case FP64:
transposed2d_fp64(input_offset, output_offset, l.M / compute_num, l.N, l.M);
break;
default:
break;
}
}

snrt_cluster_hw_barrier();

// DMA transfer the output to DRAM
if (snrt_is_dm_core()) {
snrt_dma_start_1d(l.output, output, matrix_size * l.dtype);
snrt_dma_wait_all();
}

snrt_global_barrier();
}
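
The layer splits the M input rows evenly across the compute cores: each core transposes a contiguous block of M / compute_num rows and writes its columns into the shared output with a row stride of M, so the per-core blocks interleave into the full result. A NumPy sketch of that indexing, with hypothetical sizes not taken from the commit:

import numpy as np

M, N, compute_num = 8, 6, 4
x = np.random.rand(M, N)
out = np.empty((N, M))
rows_per_core = M // compute_num

# Emulate transposed2d_fp64 once per core, as transpose2d_layer does.
for core in range(compute_num):
    row_offset = core * rows_per_core
    for m in range(rows_per_core):
        for n in range(N):
            # output[n * M_stride + m] = input[m * N + n], shifted by
            # the per-core input/output offsets.
            out[n, row_offset + m] = x[row_offset + m, n]

assert np.array_equal(out, x.T)
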
81 changes: 81 additions & 0 deletions sw/dnn/transpose/verify.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

import sys
from pathlib import Path
import numpy as np
import torch
from data.datagen import golden_model

sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
import verification # noqa: E402
from elf import Elf # noqa: E402
from data_utils import bytes_to_float, bytes_to_struct # noqa: E402


ERR_THRESHOLD = 0.003

PRECISION_T = {
8: '64',
4: '32',
2: '16',
1: '8'
}

NUMPY_T = {
'64': np.float64,
'32': np.float32,
'16': np.float16
}


def main():
# Run simulation and get outputs
args = verification.parse_args()
raw_results = verification.simulate(sim_bin=args.sim_bin,
snitch_bin=args.snitch_bin,
symbols_bin=args.symbols_bin,
log=args.log,
output_uids=['output'])

# Extract input operands from ELF file
if args.symbols_bin:
elf = Elf(args.symbols_bin)
else:
elf = Elf(args.snitch_bin)

layer_struct = {
'M': 'I',
'N': 'I',
'input_ptr': 'I',
'output_ptr': 'I',
'dtype': 'I'
}
layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct)
M = layer['M']
N = layer['N']
prec = PRECISION_T[layer['dtype']]

input = np.array(bytes_to_float(elf.get_symbol_contents('input'), prec), dtype=NUMPY_T[prec])
input = input.reshape(M, N)
input = torch.from_numpy(input)

# Verify results
output_actual = np.array(bytes_to_float(raw_results['output'], prec), dtype=NUMPY_T[prec])
output_golden = golden_model(input).detach().numpy().flatten()

absolute_err = np.absolute(output_golden - output_actual)
fail = np.any(absolute_err > ERR_THRESHOLD)
if (fail):
verification.dump_results_to_csv([output_golden, output_actual, absolute_err],
Path.cwd() / 'transpose_results.csv')

return int(fail)


if __name__ == "__main__":
sys.exit(main())
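
The 'I' codes in layer_struct describe five 32-bit unsigned fields, so the 20-byte layer symbol decodes into M, N, the two pointers, and the dtype tag. A sketch of the equivalent decoding with Python's struct module, on hypothetical raw bytes (this assumes bytes_to_struct follows struct-style format codes):

import struct

raw = struct.pack('<5I', 64, 64, 0x10000000, 0x10008000, 8)
M, N, input_ptr, output_ptr, dtype = struct.unpack('<5I', raw)
assert (M, N, dtype) == (64, 64, 8)  # dtype 8 selects '64' in PRECISION_T
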
1 change: 1 addition & 0 deletions target/snitch_cluster/sw/apps/Makefile
@@ -14,6 +14,7 @@ SUBDIRS += dnn/batchnorm
# SUBDIRS += dnn/fusedconv
SUBDIRS += dnn/gelu
SUBDIRS += dnn/gemm
SUBDIRS += dnn/transpose
SUBDIRS += dnn/layernorm
SUBDIRS += dnn/linear
SUBDIRS += dnn/maxpool
12 changes: 12 additions & 0 deletions target/snitch_cluster/sw/apps/dnn/transpose/Makefile
@@ -0,0 +1,12 @@
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

APP ?= transpose

include ../../../../../../sw/dnn/common.mk
include ../../common.mk

$(DEP): $(DATA_H)
