dnn: Refactor and verify GeLU
colluca committed Nov 8, 2023
1 parent f3266ab commit f20ace0
Showing 6 changed files with 130 additions and 102 deletions.
14 changes: 5 additions & 9 deletions sw/dnn/gelu/data/datagen.py
@@ -35,30 +35,26 @@


 def golden_model(ifmap):
-    gelu = torch.nn.GELU()
+    gelu = torch.nn.GELU(approximate='tanh')
     return gelu(ifmap)
 
 
 def emit_header(**kwargs):
 
-    batch_size = kwargs['input_dim']['batch_size']
-    seq_len = kwargs['input_dim']['seq_len']
-    hidden_nodes = kwargs['input_dim']['hidden_nodes']
+    size = kwargs['size']
     prec = str(kwargs['prec'])
 
     torch_type = data_utils.floating_point_torch_type(prec)
     ctype = data_utils.floating_point_ctype(prec)
 
-    ifmap = torch.randn(batch_size, seq_len, hidden_nodes, requires_grad=False, dtype=torch_type)
+    ifmap = torch.randn(size, requires_grad=False, dtype=torch_type)
     ofmap = golden_model(ifmap)
 
     ifmap_uid = 'ifmap'
     ofmap_uid = 'ofmap'
 
     layer_cfg = {
-        'batch_size': batch_size,
-        'seq_len': seq_len,
-        'hidden_nodes': hidden_nodes,
+        'size': size,
         'ifmap': ifmap_uid,
         'ofmap': ofmap_uid,
         'dtype': PRECISION_T[prec]
@@ -82,7 +78,7 @@ def emit_header(**kwargs):

 def main():
 
-    parser = argparse.ArgumentParser(description='Generate data for layernorm kernel')
+    parser = argparse.ArgumentParser()
     parser.add_argument(
         "-c", "--cfg",
         type=pathlib.Path,
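
The golden model now uses torch's tanh-approximate GELU, matching the tanh-based kernel this commit introduces in sw/dnn/gelu/src/gelu.h. As a quick standalone sanity check (not part of the commit; it assumes only torch and the Python standard library), the two formulas agree to floating-point accuracy, while both deviate from the exact erf-based GELU by a few 1e-4 at most:

import math

import torch


def gelu_tanh(x):
    # Same expression as gelu_activation_fp64 in sw/dnn/gelu/src/gelu.h
    return 0.5 * x * (1.0 + math.tanh(math.sqrt(2.0 / math.pi) *
                                      (x + 0.044715 * x ** 3)))


def gelu_exact(x):
    # Exact definition: x * Phi(x), with Phi the standard normal CDF
    return 0.5 * x * (1.0 + math.erf(x / math.sqrt(2.0)))


xs = torch.linspace(-4.0, 4.0, 1001, dtype=torch.float64)
golden = torch.nn.GELU(approximate='tanh')(xs)
manual = torch.tensor([gelu_tanh(v) for v in xs.tolist()], dtype=torch.float64)
assert torch.allclose(golden, manual, atol=1e-12)

approx_err = max(abs(gelu_tanh(v) - gelu_exact(v)) for v in xs.tolist())
assert approx_err < 1e-2  # the actual maximum is on the order of 3e-4

This also explains the tight ERR_THRESHOLD in verify.py below: golden model and kernel use the same approximation, so only numerical error remains between them.
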
8 changes: 2 additions & 6 deletions sw/dnn/gelu/data/params.hjson
@@ -3,10 +3,6 @@
 // SPDX-License-Identifier: SHL-0.51
 
 {
-    input_dim: {
-        batch_size: 3,
-        seq_len: 8,
-        hidden_nodes: 4
-    }
-    prec: 32
+    size: 128
+    prec: 64
 }
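
A sketch (not part of the commit; it assumes the hjson Python package and an illustrative cluster count) of how this config reaches datagen.py as kwargs, together with the divisibility property the refactored gelu_layer implicitly relies on when it splits l.size evenly across clusters:

import hjson

cfg = hjson.loads("""
{
    size: 128
    prec: 64
}
""")

n_clusters = 2  # illustrative assumption; the real value comes from the target
assert cfg['size'] % n_clusters == 0, \
    "gelu_layer divides l.size evenly over snrt_cluster_num() clusters"
print(cfg['size'], cfg['prec'])  # -> 128 64
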
126 changes: 41 additions & 85 deletions sw/dnn/gelu/src/gelu.h
@@ -11,116 +11,72 @@
  * @struct gelu_layer_struct
  * @brief This structure contains all parameters necessary
  *        for computing the GELU activation function
- * @var gelu_layer_struct::batch_size
- *      Size of each input sample
- * @var gelu_layer_struct::seq_len
- *      Size of each output sample
- * @var gelu_layer_struct::hidden_nodes
- *      Number of hidden dimensions
+ * @var gelu_layer_struct::size
+ *      Size of the feature map
  * @var gelu_layer_struct::ifmap
  *      Pointer to input feature map
  * @var gelu_layer_struct::ofmap
  *      Pointer to output feature map
  */
 typedef struct gelu_layer_struct {
-    uint32_t batch_size;
-    uint32_t seq_len;
-    uint32_t hidden_nodes;
-    float *ifmap;
-    float *ofmap;
+    uint32_t size;
+    double *ifmap;
+    double *ofmap;
     precision_t dtype;
 } gelu_layer_t;

-/**
- * Implementation of the GELU layer
- */
-static inline void gelu_fp32(float *input, float *output, int32_t ldI,
-                             uint32_t batch_size, uint32_t seq_len,
-                             uint32_t hidden_nodes) {
-    // uint32_t compute_id = snrt_cluster_compute_core_num();
-
-    for (int s = 0; s < seq_len; s++) {
-        for (int h = 0; h < hidden_nodes; h++) {
-            // if (compute_id == 1) {
-            //     printf("compute id: %d, input[%d][%d] = %f\n", compute_id, s,
-            //     h,
-            //     input[s * hidden_nodes + h]);
-            // }
-            float x = input[s * hidden_nodes + h];
-            float y =
-                0.5 * x *
-                (1.0 + tanh(sqrt(2.0 / M_PI) * (x + 0.044715 * x * x * x)));
-            output[s * hidden_nodes + h] = y;
-            // if (compute_id == 1) {
-            //     printf("compute id: %d, output[%d][%d] = %f\n", compute_id,
-            //     s, h,
-            //     output[s * hidden_nodes + h]);
-            // }
-        }
-    }
-}
+// tanh based approximation of the GeLU activation function
+static inline double gelu_activation_fp64(double x) {
+    return 0.5 * x *
+           (1.0 + tanh(sqrt(2.0 / M_PI) * (x + 0.044715 * x * x * x)));
+}
+
+// Single-cluster GeLU
+static inline void gelu_fp64(double *input, double *output, uint32_t size) {
+    if (snrt_is_compute_core()) {
+        for (uint32_t i = 0; i < size; i++) {
+            snrt_mcycle();
+            output[i] = gelu_activation_fp64(input[i]);
+        }
+    }
+}

-/**
- * @brief GELU layer
- *
- * @param l gelu_layer_t struct that holds addresses and parameters
- *
- */
-static inline void gelu_layer(const gelu_layer_t *l) {
-    uint32_t cluster_num = snrt_cluster_num();
-    uint32_t cluster_id = snrt_cluster_idx();
-    uint32_t compute_num = snrt_cluster_compute_core_num();
-    uint32_t compute_id = snrt_cluster_compute_core_num();
-
-    uint32_t ifmap_size =
-        l->batch_size * l->seq_len * l->hidden_nodes * sizeof(float);
-    uint32_t ofmap_size = ifmap_size;
-
-    void *ptr = (float *)snrt_l1_next();
-    float *ifmap = ptr;
-    ptr += ifmap_size;
-    float *ofmap = ptr;
-    ptr += ofmap_size;
-
-    // DMA transfer the ifmap into the cluster TCDM
-    if (snrt_is_dm_core()) {
-        snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
-            ifmap, l->ifmap, l->batch_size * sizeof(float),
-            l->batch_size * sizeof(float), l->batch_size * sizeof(float),
-            l->seq_len * l->hidden_nodes * sizeof(float));
-
-        snrt_dma_wait_all();
-    }
-
-    snrt_cluster_hw_barrier();
-
-    if (snrt_is_compute_core()) {
-        // determine the row offset for each core
-        int32_t row_offset = compute_id * l->hidden_nodes;
-
-        // determine the row stride of each matrix
-        int32_t ldI = compute_num * l->hidden_nodes;
-
-        // determine the batch offset for each core
-        int32_t batch_offset = l->seq_len * l->hidden_nodes;
-
-        // printf("row_offset: %d, ldI: %d\n", row_offset, ldI);
-
-        for (int b = 0; b < l->batch_size; b++) {
-            // if (compute_id == 1) {
-            //     printf("BATCH: %d\n", b);
-            // }
-            gelu_fp32(&ifmap[row_offset + b * batch_offset],
-                      &ofmap[row_offset + b * batch_offset], ldI, l->batch_size,
-                      l->seq_len / 8, l->hidden_nodes);
-        }
-
-        snrt_cluster_hw_barrier();
-    } else {
-        snrt_cluster_hw_barrier();
-    }
-    snrt_cluster_hw_barrier();
-}
+// Parallel GeLU layer with DMA transfers
+static inline void gelu_layer(const gelu_layer_t l) {
+    // Parallelize the computation over clusters
+    uint32_t cluster_fmap_size = l.size / snrt_cluster_num();
+    uint32_t cluster_fmap_bytes = cluster_fmap_size * sizeof(double);
+
+    // Allocate memory in TCDM
+    void *ptr = (double *)snrt_l1_next();
+    double *l1_ifmap = ptr;
+    ptr += cluster_fmap_bytes;
+    double *l1_ofmap = ptr;
+    ptr += cluster_fmap_bytes;
+
+    // Get pointer to feature maps in L3
+    uint32_t cluster_offset = cluster_fmap_bytes * snrt_cluster_idx();
+    double *l3_ifmap = ((void *)l.ifmap) + cluster_offset;
+    double *l3_ofmap = ((void *)l.ofmap) + cluster_offset;
+
+    // DMA transfer the ifmap into the cluster TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(l1_ifmap, l3_ifmap, cluster_fmap_bytes);
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Cluster computation
+    gelu_fp64(l1_ifmap, l1_ofmap, cluster_fmap_size);
+
+    snrt_cluster_hw_barrier();
+
+    // DMA transfer the ofmap to DRAM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(l3_ofmap, l1_ofmap, cluster_fmap_bytes);
+        snrt_dma_wait_all();
+    }
+
+    snrt_global_barrier();
+}
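
To make the addressing concrete, here is a host-side Python mirror of gelu_layer's partitioning arithmetic (illustration only; the cluster count is an assumption, and the real values come from snrt_cluster_num() and params.hjson):

size = 128       # l.size
n_clusters = 2   # snrt_cluster_num(), assumed for this example
elem_bytes = 8   # sizeof(double)

cluster_fmap_size = size // n_clusters              # elements per cluster
cluster_fmap_bytes = cluster_fmap_size * elem_bytes

for cluster_idx in range(n_clusters):
    # Byte offset of this cluster's contiguous slice in the L3 feature maps
    cluster_offset = cluster_fmap_bytes * cluster_idx
    first = cluster_offset // elem_bytes
    last = first + cluster_fmap_size - 1
    print(f"cluster {cluster_idx}: bytes [{cluster_offset}, "
          f"{cluster_offset + cluster_fmap_bytes}), elements {first}..{last}")

Each cluster thus copies in, transforms, and writes back a disjoint contiguous slice; the snrt_cluster_hw_barrier() calls separate the DMA-in, compute, and DMA-out phases, and snrt_global_barrier() synchronizes all clusters before the function returns.
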
2 changes: 1 addition & 1 deletion sw/dnn/gelu/src/main.c
@@ -6,4 +6,4 @@

 #include "data.h"
 
-int main() { gelu_layer(&layer); }
+int main() { gelu_layer(layer); }
79 changes: 79 additions & 0 deletions sw/dnn/gelu/verify.py
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

import sys
from pathlib import Path
import numpy as np
import torch
from data.datagen import golden_model

sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
import verification # noqa: E402
from elf import Elf # noqa: E402
from data_utils import bytes_to_float, bytes_to_struct # noqa: E402


ERR_THRESHOLD = 1E-6

PRECISION_T = {
    8: '64',
    4: '32',
    2: '16',
    1: '8'
}

NUMPY_T = {
    '64': np.float64,
    '32': np.float32,
    '16': np.float16
}


def main():
    # Run simulation and get outputs
    args = verification.parse_args()
    raw_results = verification.simulate(sim_bin=args.sim_bin,
                                        snitch_bin=args.snitch_bin,
                                        symbols_bin=args.symbols_bin,
                                        log=args.log,
                                        output_uids=['ofmap'])

    # Extract input operands from ELF file
    if args.symbols_bin:
        elf = Elf(args.symbols_bin)
    else:
        elf = Elf(args.snitch_bin)

    layer_struct = {
        'size': 'I',
        'ifmap': 'I',
        'ofmap': 'I',
        'dtype': 'I'
    }
    layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct)
    prec = PRECISION_T[layer['dtype']]

    ifmap = np.array(bytes_to_float(elf.get_symbol_contents('ifmap'), prec), dtype=NUMPY_T[prec])
    ifmap = torch.from_numpy(ifmap)

    # Verify results
    ofmap_actual = np.array(bytes_to_float(raw_results['ofmap'], prec), dtype=NUMPY_T[prec])
    ofmap_golden = golden_model(ifmap).detach().numpy().flatten()
    relative_err = np.absolute((ofmap_golden - ofmap_actual) / ofmap_golden)
    fail = np.any(relative_err > ERR_THRESHOLD)

    # Print results
    if (fail):
        verification.dump_results_to_csv([ofmap_golden, ofmap_actual, relative_err],
                                         Path.cwd() / 'gelu_results.csv')
        print('Maximum relative error:', np.max(relative_err))

    return int(fail)


if __name__ == "__main__":
sys.exit(main())
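
bytes_to_struct decodes the raw bytes of the layer symbol according to the field-to-format mapping above. A minimal stdlib-only sketch of the same idea (the repo's actual helper lives in util/sim/data_utils.py; the pointer addresses below are hypothetical):

import struct


def bytes_to_struct_sketch(raw, fields):
    # Assumes little-endian, packed 32-bit fields
    fmt = '<' + ''.join(fields.values())
    values = struct.unpack(fmt, raw[:struct.calcsize(fmt)])
    return dict(zip(fields.keys(), values))


layer_struct = {'size': 'I', 'ifmap': 'I', 'ofmap': 'I', 'dtype': 'I'}
# Four uint32 fields: size=128, two 32-bit pointers, dtype=8 (i.e. FP64)
raw = struct.pack('<IIII', 128, 0x80000000, 0x80000400, 8)
layer = bytes_to_struct_sketch(raw, layer_struct)
assert layer == {'size': 128, 'ifmap': 0x80000000,
                 'ofmap': 0x80000400, 'dtype': 8}
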
3 changes: 2 additions & 1 deletion target/snitch_cluster/sw/run.yaml
@@ -79,7 +79,8 @@ runs:
   - elf: apps/dnn/gemm/build/gemm.elf
   - elf: apps/dnn/layernorm/build/layernorm.elf # Illegal FDIV without FDIV unit
     cmd: ../../sw/dnn/layernorm/verify.py {sim_bin} {elf}
-  # - elf: apps/dnn/gelu/build/gelu.elf # seems like it stalls
+  - elf: apps/dnn/gelu/build/gelu.elf
+    cmd: ../../sw/dnn/gelu/verify.py {sim_bin} {elf}
   # - elf: apps/dnn/conv2d/build/conv2d.elf # fails with exit code 32
   # - elf: apps/dnn/fusedconv/build/fusedconv.elf # fails newly
   - elf: apps/dnn/softmax/build/softmax.elf # Illegal FDIV without FDIV unit
