sw: Add transpose layer for fp32 and fp64
fischeti committed Nov 7, 2023
1 parent ca4d1e3 commit e990aac
Showing 8 changed files with 351 additions and 0 deletions.
1 change: 1 addition & 0 deletions sw/dnn/src/dnn.h
@@ -204,4 +204,5 @@ typedef struct network_single_cluster_t_ {
#include "../linear/src/linear.h"
#include "../maxpool/src/maxpool.h"
#include "../softmax/src/softmax.h"
#include "../transpose/src/transpose.h"
// #include "utils.h"
109 changes: 109 additions & 0 deletions sw/dnn/transpose/data/datagen.py
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Tim Fischer <[email protected]>
# Viviane Potocnik <[email protected]>
# Luca Colagrande <[email protected]>

import argparse
import pathlib
import hjson
import sys
import os
import torch

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
import data_utils # noqa: E402
from data_utils import emit_license, \
format_struct_definition, format_array_definition, \
format_array_declaration, format_ifdef_wrapper # noqa: E402

torch.manual_seed(42)

# AXI splits bursts crossing 4KB address boundaries. To minimize
# the occurrence of these splits the data should be aligned to 4KB
BURST_ALIGNMENT = 4096
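# For example, a 256-byte burst starting at address 0x0F80 would cross
# the 0x1000 boundary and be split into two transactions; a 4 KiB-aligned
# base avoids such mid-burst crossings (illustrative figures only, not
# taken from the commit).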

PRECISION_T = {
'64': 'FP64',
'32': 'FP32',
'16': 'FP16',
'8': 'FP8'
}


def golden_model(input):
return input.t()


def emit_header(**kwargs):
M = kwargs['input_dim']['M']
N = kwargs['input_dim']['N']
prec = str(kwargs['prec'])

torch_type = data_utils.floating_point_torch_type(prec)
input = torch.randn(M, N, requires_grad=False, dtype=torch_type)

output = golden_model(input)
output = output.detach().numpy()

ctype = data_utils.floating_point_ctype(prec)

input_uid = 'input'
output_uid = 'output'

layer_cfg = {
**kwargs['input_dim'],
'input': input_uid,
'output': output_uid,
'dtype': PRECISION_T[prec]
}

data_str = [emit_license()]
data_str += [format_array_declaration(ctype, input_uid, input.shape,
alignment=BURST_ALIGNMENT)]
data_str += [format_array_declaration(ctype, output_uid, output.shape,
alignment=BURST_ALIGNMENT)]
data_str += [format_struct_definition('transpose2d_layer_t', 'layer', layer_cfg)]
data_str += [format_array_definition(ctype, input_uid, input,
alignment=BURST_ALIGNMENT)]
result_def = format_array_definition(ctype, 'golden', output, alignment=BURST_ALIGNMENT)
data_str += [format_ifdef_wrapper('BIST', result_def)]
data_str = '\n\n'.join(data_str)

return data_str


def main():

parser = argparse.ArgumentParser(description='Generate data for transpose kernel')
parser.add_argument(
"-c", "--cfg",
type=pathlib.Path,
required=True,
help='Select param config file for the kernel'
)
parser.add_argument(
'--section',
type=str,
help='Section to store matrices in')
parser.add_argument(
'output',
type=pathlib.Path,
help='Path of the output header file')
args = parser.parse_args()

# Load param config file
with args.cfg.open() as f:
param = hjson.loads(f.read())
param['section'] = args.section

# Emit header file
with open(args.output, 'w') as f:
f.write(emit_header(**param))


if __name__ == '__main__':
main()
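
For reference, the golden model above is a plain 2-D transpose. A minimal standalone sketch of its behavior, using a hypothetical 4x3 input that is not part of the commit:

import torch

# Element [m, n] of the input ends up at [n, m] of the output.
x = torch.randn(4, 3, dtype=torch.float64)
y = x.t()
assert y.shape == (3, 4)
assert all(torch.equal(y[n, m], x[m, n])
           for m in range(4) for n in range(3))
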
11 changes: 11 additions & 0 deletions sw/dnn/transpose/data/params.hjson
@@ -0,0 +1,11 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51

{
input_dim: {
M: 64,
N: 64,
}
prec: 64
}
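
datagen.py reads this file with hjson and forwards the resulting dictionary to emit_header as keyword arguments. A short sketch of that round trip, assuming the file sits at data/params.hjson relative to the working directory:

import hjson

with open('data/params.hjson') as f:
    param = hjson.loads(f.read())
assert param['input_dim'] == {'M': 64, 'N': 64}
assert param['prec'] == 64  # maps to 'FP64' via PRECISION_T in datagen.py
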
14 changes: 14 additions & 0 deletions sw/dnn/transpose/src/main.c
@@ -0,0 +1,14 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Luca Colagrande <[email protected]>

#include "dnn.h"

#include "data.h"

int main() {
transpose2d_layer(layer);
return 0;
}
122 changes: 122 additions & 0 deletions sw/dnn/transpose/src/transpose.h
@@ -0,0 +1,122 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "math.h"
#include "snrt.h"

/**
* @struct transpose2d_layer_struct
* @brief This structure contains all parameters necessary
* for computing the Transpose2D of a matrix
 * @var transpose2d_layer_struct::M
 * First dimension of the matrix
 * @var transpose2d_layer_struct::N
 * Second dimension of the matrix
 * @var transpose2d_layer_struct::input
 * Pointer to input feature map
 * @var transpose2d_layer_struct::output
 * Pointer to output feature map
 * @var transpose2d_layer_struct::dtype
 * Precision of the layer data
 */
typedef struct transpose2d_layer_struct {
uint32_t M;
uint32_t N;
void *input;
void *output;
precision_t dtype;
} transpose2d_layer_t;

/**
* @brief Implementation of the FP64 Transpose2D kernel
*
* @param input Pointer to input feature map
* @param output Pointer to output feature map
* @param M First dimension of the matrix
 * @param N Second dimension of the matrix
 * @param M_stride Row stride of the output matrix (in elements)
 */
static inline void transposed2d_fp64(double* input, double* output, uint32_t M, uint32_t N, uint32_t M_stride) {
for (uint32_t m = 0; m < M; m++) {
for (uint32_t n = 0; n < N; n++) {
output[n * M_stride + m] = input[m * N + n];
}
}
}

/**
* @brief Implementation of the FP32 Transpose2D kernel
*
* @param input Pointer to input feature map
* @param output Pointer to output feature map
* @param M First dimension of the matrix
 * @param N Second dimension of the matrix
 * @param M_stride Row stride of the output matrix (in elements)
 */
static inline void transposed2d_fp32(float* input, float* output, uint32_t M, uint32_t N, uint32_t M_stride) {
for (uint32_t m = 0; m < M; m++) {
for (uint32_t n = 0; n < N; n++) {
output[n * M_stride + m] = input[m * N + n];
}
}
}

/**
* @brief Transpose2D layer
*
* @param l transpose2D struct that holds addresses and parameters
*
*/
static inline void transpose2d_layer(transpose2d_layer_t const l) {
uint32_t cluster_num = snrt_cluster_num();
uint32_t cluster_id = snrt_cluster_idx();
uint32_t compute_num = snrt_cluster_compute_core_num();
uint32_t compute_id = snrt_global_core_idx();

uint32_t matrix_size = l.M * l.N;

void *ptr = snrt_l1_next();
void *input = ptr;
ptr += matrix_size * l.dtype;
void *output = ptr;
ptr += matrix_size * l.dtype;

// DMA transfer the matrix into the cluster TCDM
if (snrt_is_dm_core()) {
snrt_dma_start_1d(input, l.input, matrix_size * l.dtype);
snrt_dma_wait_all();
}

snrt_cluster_hw_barrier();

if (snrt_is_compute_core()) {
// determine the row offset for each core
int32_t row_offset = compute_id * (l.M / compute_num);

// calculate the input address offset
void* input_offset = input + row_offset * l.N * l.dtype;

// calculate the output address offset
void* output_offset = output + row_offset * l.dtype;

switch(l.dtype) {
case FP32:
transposed2d_fp32(input_offset, output_offset, l.M / compute_num, l.N, l.M);
break;
case FP64:
transposed2d_fp64(input_offset, output_offset, l.M / compute_num, l.N, l.M);
break;
default:
break;
}
}

snrt_cluster_hw_barrier();

// DMA transfer the output to DRAM
if (snrt_is_dm_core()) {
snrt_dma_start_1d(l.output, output, matrix_size * l.dtype);
snrt_dma_wait_all();
}

snrt_global_barrier();
}
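
The layer splits the M input rows evenly across the compute cores: each core transposes a contiguous block of M / compute_num rows and writes its columns into the shared output with a row stride of M, so the per-core blocks interleave into the full result. A NumPy sketch of that indexing, with hypothetical sizes not taken from the commit:

import numpy as np

M, N, compute_num = 8, 6, 4
x = np.random.rand(M, N)
out = np.empty((N, M))
rows_per_core = M // compute_num

# Emulate transposed2d_fp64 once per core, as transpose2d_layer does.
for core in range(compute_num):
    row_offset = core * rows_per_core
    for m in range(rows_per_core):
        for n in range(N):
            # output[n * M_stride + m] = input[m * N + n], shifted by
            # the per-core input/output offsets.
            out[n, row_offset + m] = x[row_offset + m, n]

assert np.array_equal(out, x.T)
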
81 changes: 81 additions & 0 deletions sw/dnn/transpose/verify.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

import sys
from pathlib import Path
import numpy as np
import torch
from data.datagen import golden_model

sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
import verification # noqa: E402
from elf import Elf # noqa: E402
from data_utils import bytes_to_float, bytes_to_struct # noqa: E402


ERR_THRESHOLD = 0.003

PRECISION_T = {
8: '64',
4: '32',
2: '16',
1: '8'
}

NUMPY_T = {
'64': np.float64,
'32': np.float32,
'16': np.float16
}


def main():
# Run simulation and get outputs
args = verification.parse_args()
raw_results = verification.simulate(sim_bin=args.sim_bin,
snitch_bin=args.snitch_bin,
symbols_bin=args.symbols_bin,
log=args.log,
output_uids=['output'])

# Extract input operands from ELF file
if args.symbols_bin:
elf = Elf(args.symbols_bin)
else:
elf = Elf(args.snitch_bin)

layer_struct = {
'M': 'I',
'N': 'I',
'input_ptr': 'I',
'output_ptr': 'I',
'dtype': 'I'
}
layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct)
M = layer['M']
N = layer['N']
prec = PRECISION_T[layer['dtype']]

input = np.array(bytes_to_float(elf.get_symbol_contents('input'), prec), dtype=NUMPY_T[prec])
input = input.reshape(M, N)
input = torch.from_numpy(input)

# Verify results
output_actual = np.array(bytes_to_float(raw_results['output'], prec), dtype=NUMPY_T[prec])
output_golden = golden_model(input).detach().numpy().flatten()

absolute_err = np.absolute(output_golden - output_actual)
fail = np.any(absolute_err > ERR_THRESHOLD)
if (fail):
verification.dump_results_to_csv([output_golden, output_actual, absolute_err],
Path.cwd() / 'transpose_results.csv')

return int(fail)


if __name__ == "__main__":
sys.exit(main())
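
The 'I' codes in layer_struct describe five 32-bit unsigned fields, so the 20-byte layer symbol decodes into M, N, the two pointers, and the dtype tag. A sketch of the equivalent decoding with Python's struct module, on hypothetical raw bytes (this assumes bytes_to_struct follows struct-style format codes):

import struct

raw = struct.pack('<5I', 64, 64, 0x10000000, 0x10008000, 8)
M, N, input_ptr, output_ptr, dtype = struct.unpack('<5I', raw)
assert (M, N, dtype) == (64, 64, 8)  # dtype 8 selects '64' in PRECISION_T
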
1 change: 1 addition & 0 deletions target/snitch_cluster/sw/apps/Makefile
@@ -14,6 +14,7 @@ SUBDIRS += dnn/batchnorm
# SUBDIRS += dnn/fusedconv
SUBDIRS += dnn/gelu
SUBDIRS += dnn/gemm
SUBDIRS += dnn/transpose
SUBDIRS += dnn/layernorm
SUBDIRS += dnn/linear
SUBDIRS += dnn/maxpool
12 changes: 12 additions & 0 deletions target/snitch_cluster/sw/apps/dnn/transpose/Makefile
@@ -0,0 +1,12 @@
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

APP ?= transpose

include ../../../../../../sw/dnn/common.mk
include ../../common.mk

$(DEP): $(DATA_H)
