Skip to content

Commit

Permalink
Temporary DNN refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
colluca committed Oct 25, 2023
1 parent 5b937ad commit 1eb0e45
Show file tree
Hide file tree
Showing 22 changed files with 355 additions and 141 deletions.
13 changes: 6 additions & 7 deletions sw/blas/axpy/data/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
import os

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import format_scalar_definition, format_vector_definition, \
format_vector_declaration, format_ifdef_wrapper # noqa: E402
from data_utils import format_scalar_definition, format_array_definition, \
format_array_declaration, format_ifdef_wrapper # noqa: E402

MIN = -1000
MAX = +1000
Expand Down Expand Up @@ -47,16 +47,15 @@ def main():
a = np.random.uniform(MIN, MAX, 1)
x = np.random.uniform(MIN, MAX, length)
y = np.random.uniform(MIN, MAX, length)
z = np.zeros(length)
g = golden_model(a, x, y)

# Format header file
l_str = format_scalar_definition('const uint32_t', 'l', length)
a_str = format_scalar_definition('const double', 'a', a[0])
x_str = format_vector_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section)
y_str = format_vector_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section)
z_str = format_vector_declaration('double', 'z', z, alignment=BURST_ALIGNMENT, section=section)
g_str = format_vector_definition('double', 'g', g)
x_str = format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section)
y_str = format_array_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section)
z_str = format_array_declaration('double', 'z', [length], alignment=BURST_ALIGNMENT, section=section)
g_str = format_array_definition('double', 'g', g)
g_str = format_ifdef_wrapper('BIST', g_str)
f_str = '\n\n'.join([l_str, a_str, x_str, y_str, z_str, g_str])
f_str += '\n'
Expand Down
12 changes: 6 additions & 6 deletions sw/blas/gemm/data/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import emit_license, format_scalar_definition, \
format_vector_definition, format_ifdef_wrapper # noqa: E402
format_array_definition, format_ifdef_wrapper # noqa: E402


np.random.seed(42)
Expand Down Expand Up @@ -100,16 +100,16 @@ def emit_header(**kwargs):
data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])]
data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)]
data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])]
data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
alignment=BURST_ALIGNMENT, section=kwargs['section'])]
data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
alignment=BURST_ALIGNMENT, section=kwargs['section'])]
data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(),
data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(),
alignment=BURST_ALIGNMENT, section=kwargs['section'])]
if kwargs['prec'] == 8:
result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten())
result_def = format_array_definition(C_TYPES['64'], 'result', result.flatten())
else:
result_def = format_vector_definition(C_TYPES[str(kwargs['prec'])],
result_def = format_array_definition(C_TYPES[str(kwargs['prec'])],
'result',
result.flatten())
data_str += [format_ifdef_wrapper('BIST', result_def)]
Expand Down
File renamed without changes.
31 changes: 31 additions & 0 deletions sw/dnn/common.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

# Shared build fragment for the DNN applications. It does not assign APP
# itself; presumably the including Makefile sets APP to the application's
# directory name before including this file (TODO confirm against callers).

# Usage of absolute paths is required to externally include this Makefile
# MK_DIR is the directory of the most recently parsed makefile, i.e. this file
MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))

# Per-application data/source directories and the sources shared by all apps
DATA_DIR := $(realpath $(MK_DIR)/$(APP)/data)
SRC_DIR := $(realpath $(MK_DIR)/$(APP)/src)
COMMON_SRC_DIR := $(realpath $(MK_DIR)/src)

# Overridable inputs to the data generator: the parameter file and the
# value forwarded to the generator's --section option (empty by default)
DATA_CFG ?= $(DATA_DIR)/params.hjson
SECTION ?=

# Overridable source list and include directories
SRCS ?= $(realpath $(SRC_DIR)/main.c)
INCDIRS ?= $(DATA_DIR) $(SRC_DIR) $(COMMON_SRC_DIR)

# Generator script and the header it produces
DATAGEN_PY := $(DATA_DIR)/datagen.py
DATA_H := $(DATA_DIR)/data.h

# Regenerate the data header whenever the generator script or the parameter
# file changes. The script ($<) is executed directly and its standard output
# is redirected into the header ($@).
$(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
$< -c $(DATA_CFG) --section="$(SECTION)" > $@

.PHONY: clean-data clean

# Remove only the generated data header
clean-data:
rm -f $(DATA_H)

# This fragment's contribution to `clean` is removing the generated data
clean: clean-data
File renamed without changes.
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions sw/dnn/layernorm/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data/data.h
99 changes: 99 additions & 0 deletions sw/dnn/layernorm/data/datagen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

import argparse
import pathlib
import hjson
import sys
import os
import torch

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import emit_license, format_scalar_definition, \
format_struct_definition, format_array_definition, \
format_array_declaration, format_ifdef_wrapper # noqa: E402
import data_utils

torch.manual_seed(42)

# AXI splits bursts crossing 4KB address boundaries. To minimize
# the occurrence of these splits the data should be aligned to 4KB
BURST_ALIGNMENT = 4096


def golden_model(ifmap, eps, shape, prec):
    """Compute the LayerNorm reference output with PyTorch.

    Args:
        ifmap: Input tensor to normalize.
        eps: Forwarded to ``torch.nn.LayerNorm`` as ``eps``.
        shape: Forwarded to ``torch.nn.LayerNorm`` as ``normalized_shape``.
        prec: Unused. Kept so that existing callers need no change; the
            precision is taken from ``ifmap.dtype`` instead.

    Returns:
        The tensor produced by applying a freshly constructed
        ``torch.nn.LayerNorm`` module to ``ifmap``.
    """
    # Build the module's parameters in the same dtype as the input. Leaving
    # them at the float32 default mismatches any non-float32 input.
    ln = torch.nn.LayerNorm(shape, eps=eps, dtype=ifmap.dtype)
    return ln(ifmap)


def emit_header(**kwargs):
    """Build the text of the C data header for the LayerNorm kernel.

    Keyword Args:
        input_dim: Mapping providing ``batch_size``, ``seq_len`` and
            ``embeddings``. All of its entries are copied into the emitted
            ``layer`` struct.
        eps: Epsilon forwarded to the golden model.
        prec: Precision selector. It is converted with ``str()`` and passed
            to the ``data_utils`` type-lookup helpers.

    Any other keyword is accepted but not read here. In particular, the
    ``section`` entry that ``main`` adds is currently ignored.

    Returns:
        str: The header pieces joined by blank lines. These are the license,
        the ``ifmap``/``ofmap`` declarations, the ``layer`` struct, the
        ``ifmap`` definition and the ``golden`` array wrapped in an
        ``#ifdef BIST`` guard.
    """
    input_dim = kwargs['input_dim']
    batch_size = input_dim['batch_size']
    seq_len = input_dim['seq_len']
    embeddings = input_dim['embeddings']
    eps = kwargs['eps']
    prec = str(kwargs['prec'])

    # Random input in the requested precision, and its reference output
    torch_type = data_utils.floating_point_torch_type(prec)
    ifmap = torch.randn(batch_size, seq_len, embeddings, requires_grad=False, dtype=torch_type)
    ofmap = golden_model(ifmap, eps, embeddings, prec).detach().numpy()

    ctype = data_utils.floating_point_ctype(prec)

    ifmap_uid = 'ifmap'
    ofmap_uid = 'ofmap'

    # The struct refers to the arrays by symbol name, so both arrays are
    # declared ahead of the struct definition below.
    layer_cfg = {
        **input_dim,
        'ifmap': ifmap_uid,
        'ofmap': ofmap_uid
    }

    golden_def = format_array_definition(ctype, 'golden', ofmap, alignment=BURST_ALIGNMENT)
    data_str = [
        emit_license(),
        format_array_declaration(ctype, ifmap_uid, ifmap.shape,
                                 alignment=BURST_ALIGNMENT),
        format_array_declaration(ctype, ofmap_uid, ofmap.shape,
                                 alignment=BURST_ALIGNMENT),
        format_struct_definition('layernorm_layer_t', 'layer', layer_cfg),
        format_array_definition(ctype, ifmap_uid, ifmap,
                                alignment=BURST_ALIGNMENT),
        format_ifdef_wrapper('BIST', golden_def),
    ]

    return '\n\n'.join(data_str)


def main():
    """Parse the command line, load the parameter file and print the header.

    The parameter file given with ``-c/--cfg`` is parsed with ``hjson``. The
    value of ``--section`` is stored in the parsed parameters under the
    ``section`` key, and the parameters are then passed as keyword arguments
    to ``emit_header``. The result is written to standard output.
    """
    cli = argparse.ArgumentParser(description='Generate data for layernorm kernel')
    cli.add_argument("-c", "--cfg",
                     type=pathlib.Path,
                     required=True,
                     help='Select param config file kernel')
    cli.add_argument('--section',
                     type=str,
                     help='Section to store matrices in')
    opts = cli.parse_args()

    # Load param config file
    with opts.cfg.open() as cfg_file:
        cfg = hjson.loads(cfg_file.read())
    cfg['section'] = opts.section

    # Emit header file
    header = emit_header(**cfg)
    print(header)


if __name__ == '__main__':
    main()
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Copyright 2023 ETH Zurich and University of Bologna.
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51

// Parameters for a single LayerNorm layer

{
kernel: "LayerNorm"
input_dim: {
batch_size: 1,
seq_len: 32,
Expand Down
85 changes: 22 additions & 63 deletions sw/dnn/src/layernorm.h → sw/dnn/layernorm/src/layernorm.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "math.h"
#include "snrt.h"
// #include "printf.h"
#include "utils.h"
#include "dnn.h"

// add dump function for layernorm
dump_float(ln, 5);
Expand All @@ -16,29 +16,24 @@ dump_float(ln, 5);
* @struct layernorm_layer_struct
* @brief This structure contains all parameters necessary
* for computing the LayerNorm activation function
* @var layernorm_layer_struct::BATCH_SIZE
* @var layernorm_layer_struct::batch_size
* Size of each input sample
* @var layernorm_layer_struct::SEQ_LEN
* @var layernorm_layer_struct::seq_len
* Size of each output sample
* @var layernorm_layer_struct::EMBEDDINGS
* @var layernorm_layer_struct::embeddings
* Number of hidden dimensions
* @var layernorm_layer_struct::ifmap
* Pointer to input feature map
* @var layernorm_layer_struct::ofmap
* Pointer to output feature map
* @var layernorm_layer_struct::result
* Pointer to the golden model output
*/
typedef struct layernorm_layer_struct {
uint32_t BATCH_SIZE;
uint32_t SEQ_LEN;
uint32_t EMBEDDINGS;
uint32_t EPS;

float *ifmap;
float *ofmap;
float *result;

uint32_t batch_size;
uint32_t seq_len;
uint32_t embeddings;
uint32_t eps;
void *ifmap;
void *ofmap;
precision_t dtype;
} layernorm_layer_t;

Expand Down Expand Up @@ -95,44 +90,8 @@ static inline void layernorm_fp32(float *input, float *output, int32_t ldI,
*/
static inline void transformer_layernorm_fp64(double *input, int32_t ldI,
int32_t seq_len, int32_t embeddings,
int32_t eps) {
double mean = 0.0; // max value of the current core
double var = 0.0; // sum of the exp values of the current core

uint32_t compute_id = snrt_global_core_idx();
uint32_t num_cores = snrt_cluster_compute_core_num();

for (int32_t s = 0; s < seq_len; s++) {
mean = 0.0;
var = 0.0;

for (int32_t i = 0; i < embeddings; i++) {
mean += input[s * ldI + i];
}
mean /= embeddings;

// printf("mean[%d] = %f\n", b, mean);

for (int32_t i = 0; i < embeddings; i++) {
var += (input[s * ldI + i] - mean) *
(input[s * ldI + i] - mean);
}
var /= embeddings;

// printf("var[%d] = %f\n", b, var);

// compute the shifted value of the current row
for (int32_t i = 0; i < embeddings; i++) {
input[s * ldI + i] =
(input[s * ldI + i] - mean) /
sqrtf(var + eps);
// printf("output[%d][%d][%d] = %f\n", b, s + compute_id, i,
// output[s * ldI + i]);
// dump_ln(input[s * ldI + i]);
}
}

snrt_cluster_hw_barrier();
int32_t eps) {
layernorm_fp64(input, input, ldI, 0, 1, seq_len, embeddings, eps);
}


Expand Down Expand Up @@ -187,14 +146,14 @@ static inline void transformer_layernorm_fp32(float *input, int32_t ldI,
* @param l layernorm_layer struct that holds addresses and parameters
*
*/
static inline void layernorm_layer(const layernorm_layer_t *l) {
static inline void layernorm_layer(layernorm_layer_t l) {
uint32_t cluster_num = snrt_cluster_num();
uint32_t cluster_id = snrt_cluster_idx();
uint32_t compute_num = snrt_cluster_compute_core_num();
uint32_t compute_id = snrt_global_core_idx();

uint32_t ifmap_size =
l->BATCH_SIZE * l->SEQ_LEN * l->EMBEDDINGS * sizeof(float);
l.batch_size * l.seq_len * l.embeddings * sizeof(float);
uint32_t ofmap_size = ifmap_size;

void *ptr = (float *)snrt_l1_next();
Expand All @@ -206,9 +165,9 @@ static inline void layernorm_layer(const layernorm_layer_t *l) {
// DMA transfer the ifmap into the cluster TCDM
if (snrt_is_dm_core()) {
snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float),
l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float),
l->SEQ_LEN * l->EMBEDDINGS * sizeof(float));
ifmap, l.ifmap, l.batch_size * sizeof(float),
l.batch_size * sizeof(float), l.batch_size * sizeof(float),
l.seq_len * l.embeddings * sizeof(float));

snrt_dma_wait_all();
}
Expand All @@ -217,18 +176,18 @@ static inline void layernorm_layer(const layernorm_layer_t *l) {

if (snrt_is_compute_core()) {
// determine the row offset for each core
int32_t row_offset = compute_id * l->EMBEDDINGS;
int32_t row_offset = compute_id * l.embeddings;

// determine the row stride of each matrix
int32_t ldI = compute_num * l->EMBEDDINGS;
int32_t ldI = compute_num * l.embeddings;

// determine the batch offset for each core
int32_t batch_offset = l->SEQ_LEN * l->EMBEDDINGS;
int32_t batch_offset = l.seq_len * l.embeddings;

// printf("row_offset: %d, ldI: %d\n", row_offset, ldI);
layernorm_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI,
batch_offset, l->BATCH_SIZE, l->SEQ_LEN / 8,
l->EMBEDDINGS, l->EPS);
batch_offset, l.batch_size, l.seq_len / 8,
l.embeddings, l.eps);

} else {
snrt_cluster_hw_barrier();
Expand Down
Loading

0 comments on commit 1eb0e45

Please sign in to comment.