Commit 33ee07f: Implement revisions

colluca committed Feb 8, 2024 · 1 parent 546df1a
Showing 61 changed files with 354 additions and 1,478 deletions.
2 changes: 1 addition & 1 deletion .clang-format

@@ -5,4 +5,4 @@
 # The CI runs on `clang-format` version 10
 BasedOnStyle: Google
 IndentWidth: 4
-IncludeBlocks: Preserve
\ No newline at end of file
+IncludeBlocks: Preserve
2 changes: 1 addition & 1 deletion .clang-format-ignore

@@ -3,4 +3,4 @@
 # SPDX-License-Identifier: Apache-2.0

 # Ignore vendored third-party code
-./sw/math/*
\ No newline at end of file
+./sw/math/*
1 change: 1 addition & 0 deletions docs/rm/sim/data_utils.md

@@ -0,0 +1 @@
+::: data_utils
1 change: 1 addition & 0 deletions mkdocs.yml

@@ -54,6 +54,7 @@ nav:
       # - Solder: rm/solder.md
       - Software:
         - Simulation Utilities:
+          - data_utils: rm/sim/data_utils.md
           - sim_utils: rm/sim/sim_utils.md
           - rm/sim/Simulation.md
           - rm/sim/Simulator.md
10 changes: 5 additions & 5 deletions sw/blas/axpy/verify.py

@@ -13,7 +13,7 @@
 sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
 import verification  # noqa: E402
 from elf import Elf  # noqa: E402
-from data_utils import bytes_to_float  # noqa: E402
+from data_utils import from_buffer  # noqa: E402


 ERR_THRESHOLD = 1E-10
@@ -27,16 +27,16 @@ def main():
                                      symbols_bin=args.symbols_bin,
                                      log=args.log,
                                      output_uids=['z'])
-    z_actual = np.array(bytes_to_float(raw_results['z'], prec='64'))
+    z_actual = from_buffer(raw_results['z'], 'double')

     # Extract input operands from ELF file
     if args.symbols_bin:
        elf = Elf(args.symbols_bin)
    else:
        elf = Elf(args.snitch_bin)
-    a = np.array(bytes_to_float(elf.get_symbol_contents('a'), prec='64'))
-    x = np.array(bytes_to_float(elf.get_symbol_contents('x'), prec='64'))
-    y = np.array(bytes_to_float(elf.get_symbol_contents('y'), prec='64'))
+    a = elf.from_symbol('a', 'double')
+    x = elf.from_symbol('x', 'double')
+    y = elf.from_symbol('y', 'double')

     # Verify results
     z_golden = golden_model(a, x, y)
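The pattern above repeats throughout this commit: `bytes_to_float(elf.get_symbol_contents(name), prec=...)` collapses into `elf.from_symbol(name, ctype)`, and raw output buffers go through `from_buffer(buf, ctype)` keyed by a C type name instead of a bit-width string. The sketch below shows one way such a helper could look; it is inferred purely from the call sites in this diff (the real code lives in `util/sim/data_utils.py` and `util/sim/elf.py`), so the dictionary contents and dtype coverage are assumptions.

```python
import numpy as np

# Assumed mapping from C type names to NumPy dtypes; the real helper may
# cover more types (e.g. FP8) and perform extra validation.
_DTYPE_FROM_CTYPE = {
    'double': np.float64,
    'float': np.float32,
    '__fp16': np.float16,
    'uint32_t': np.uint32,
    'int32_t': np.int32,
}


def from_buffer(buf, ctype='uint32_t'):
    """Interpret a raw byte buffer as a NumPy array of the given C type."""
    return np.frombuffer(buf, dtype=_DTYPE_FROM_CTYPE[ctype])
```

`Elf.from_symbol(name, ctype)` then presumably just composes the existing `get_symbol_contents(name)` with `from_buffer`.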
2 changes: 1 addition & 1 deletion sw/blas/gemm/Makefile

@@ -9,7 +9,7 @@ MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
 DATA_DIR := $(realpath $(MK_DIR)/data)
 SRC_DIR := $(realpath $(MK_DIR)/src)

-DATA_CFG ?= $(DATA_DIR)/params.hjson
+DATA_CFG ?= $(DATA_DIR)/params.json
 SECTION ?=

 APP ?= gemm
6 changes: 3 additions & 3 deletions sw/blas/gemm/data/datagen.py

@@ -9,7 +9,7 @@
 import numpy as np
 import argparse
 import pathlib
-import hjson
+import json5
 import sys
 import os

@@ -121,7 +121,7 @@ def emit_header(**kwargs):
     data_str += [format_scalar_definition('uint32_t', 'k_tiles', kwargs['k_tiles'])]
     data_str += [format_scalar_definition('uint32_t', 'parallelize_m', kwargs['parallelize_m'])]
     data_str += [format_scalar_definition('uint32_t', 'parallelize_k', kwargs['parallelize_k'])]
-    data_str += [format_scalar_definition('uint32_t', 'baseline', baseline)]
+    data_str += [format_scalar_definition('uint32_t', 'baseline', int(baseline))]
     data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
                                          alignment=BURST_ALIGNMENT, section=kwargs['section'])]
     data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
@@ -157,7 +157,7 @@ def main():

     # Load param config file
     with args.cfg.open() as f:
-        param = hjson.loads(f.read())
+        param = json5.loads(f.read())
     param['section'] = args.section

     # Emit header file
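Swapping `hjson` for `json5` is more than a rename: JSON5 keeps comments and unquoted keys but, unlike HJSON, requires commas between members, which is why the params files in this commit gain commas and quoted string values. It also parses `true`/`false` into Python booleans, hence the new `int(baseline)` cast before the value is emitted as a `uint32_t` scalar. A quick illustration, assuming the `json5` package from PyPI:

```python
import json5

# JSON5 accepts comments, unquoted keys and trailing commas, but member
# separators are mandatory (HJSON also allowed newline-separated members).
cfg = json5.loads("""
{
    M: 16,            // comment, unquoted key
    prec: "FP64",     // values follow regular JSON string rules
    baseline: true,   // parsed as a Python bool
}
""")
assert cfg['M'] == 16 and cfg['prec'] == 'FP64'
assert int(cfg['baseline']) == 1  # booleans need casting before C codegen
```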
14 changes: 7 additions & 7 deletions sw/blas/gemm/data/params.hjson → sw/blas/gemm/data/params.json

@@ -10,13 +10,13 @@
     K: 4,
     beta: 0,
     ta: false,
-    tb: true, // must be true for SIMD
+    tb: false, // must be true for SIMD
     prec: 64,
     expand: 0,
-    m_tiles: 2 // number of tiles in M dimension
-    k_tiles: 2 // number of tiles in K dimension
-    n_tiles: 2 // number of tiles in N dimension
-    parallelize_k: 0
-    parallelize_m: 1
-    baseline: 0
+    m_tiles: 2, // number of tiles in M dimension
+    k_tiles: 2, // number of tiles in K dimension
+    n_tiles: 2, // number of tiles in N dimension
+    parallelize_k: 0,
+    parallelize_m: 1,
+    baseline: true
}
35 changes: 17 additions & 18 deletions sw/blas/gemm/src/gemm.h

@@ -236,8 +236,7 @@ void gemm_fp64_opt(uint32_t M, uint32_t N, uint32_t K, double* A, uint32_t ldA,
     // Unrolling factor of most inner loop.
     // Should be at least as high as the FMA delay
    // for maximum utilization
-    // const uint32_t unroll = 8;
-    const uint32_t unroll = 4;
+    const uint32_t unroll = 8;

     // A is of size MxK, B is of size KxN, C is of size MxN
     // for (uint32_t m = 0; m < M; m++) {
@@ -307,30 +306,30 @@
                c[1] = C[m * ldC + n + 1];
                c[2] = C[m * ldC + n + 2];
                c[3] = C[m * ldC + n + 3];
-                // c[4] = C[m * ldC + n + 4];
-                // c[5] = C[m * ldC + n + 5];
-                // c[6] = C[m * ldC + n + 6];
-                // c[7] = C[m * ldC + n + 7];
+                c[4] = C[m * ldC + n + 4];
+                c[5] = C[m * ldC + n + 5];
+                c[6] = C[m * ldC + n + 6];
+                c[7] = C[m * ldC + n + 7];
            } else {
                c[0] = 0.0;
                c[1] = 0.0;
                c[2] = 0.0;
                c[3] = 0.0;
-                // c[4] = 0.0;
-                // c[5] = 0.0;
-                // c[6] = 0.0;
-                // c[7] = 0.0;
+                c[4] = 0.0;
+                c[5] = 0.0;
+                c[6] = 0.0;
+                c[7] = 0.0;
            }
            asm volatile(
                "frep.o %[n_frep], %[unroll], 0, 0 \n"
                "fmadd.d %[c0], ft0, ft1, %[c0] \n"
                "fmadd.d %[c1], ft0, ft1, %[c1] \n"
                "fmadd.d %[c2], ft0, ft1, %[c2] \n"
                "fmadd.d %[c3], ft0, ft1, %[c3] \n"
-                // "fmadd.d %[c4], ft0, ft1, %[c4] \n"
-                // "fmadd.d %[c5], ft0, ft1, %[c5] \n"
-                // "fmadd.d %[c6], ft0, ft1, %[c6] \n"
-                // "fmadd.d %[c7], ft0, ft1, %[c7] \n"
+                "fmadd.d %[c4], ft0, ft1, %[c4] \n"
+                "fmadd.d %[c5], ft0, ft1, %[c5] \n"
+                "fmadd.d %[c6], ft0, ft1, %[c6] \n"
+                "fmadd.d %[c7], ft0, ft1, %[c7] \n"
                : [ c0 ] "+f"(c[0]), [ c1 ] "+f"(c[1]), [ c2 ] "+f"(c[2]),
                  [ c3 ] "+f"(c[3]), [ c4 ] "+f"(c[4]), [ c5 ] "+f"(c[5]),
                  [ c6 ] "+f"(c[6]), [ c7 ] "+f"(c[7])
@@ -342,10 +341,10 @@ void gemm_fp64_opt(uint32_t M, uint32_t N, uint32_t K, double* A, uint32_t ldA,
            C[m * ldC + n + 1] = c[1];
            C[m * ldC + n + 2] = c[2];
            C[m * ldC + n + 3] = c[3];
-            // C[m * ldC + n + 4] = c[4];
-            // C[m * ldC + n + 5] = c[5];
-            // C[m * ldC + n + 6] = c[6];
-            // C[m * ldC + n + 7] = c[7];
+            C[m * ldC + n + 4] = c[4];
+            C[m * ldC + n + 5] = c[5];
+            C[m * ldC + n + 6] = c[6];
+            C[m * ldC + n + 7] = c[7];
            n += unroll;
        }
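Re-enabling the four extra accumulators restores `unroll = 8`. The unroll factor matters because every `fmadd.d` in the FREP body accumulates into its own register: an accumulator is reused only once every `unroll` instructions, so the issue stream never stalls as long as `unroll` is at least the FMA latency, exactly as the comment in the kernel says. A toy model of that trade-off (the latency value here is an illustrative assumption, not a figure from the Snitch FPU documentation):

```python
# Toy utilization model for a pipelined FMA unit fed by `unroll`
# independent accumulator chains.
def fpu_utilization(unroll, fma_latency=8):  # latency assumed, for illustration
    # c[i] is written again `unroll` instructions after its previous write,
    # so issue stalls whenever unroll < fma_latency.
    return min(1.0, unroll / fma_latency)


print(fpu_utilization(4))  # 0.5 -> the old unroll = 4 leaves pipeline bubbles
print(fpu_utilization(8))  # 1.0 -> unroll = 8 keeps the FPU busy
```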
24 changes: 11 additions & 13 deletions sw/blas/gemm/verify.py

@@ -13,7 +13,7 @@
 sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
 import verification  # noqa: E402
 from elf import Elf  # noqa: E402
-from data_utils import bytes_to_float, bytes_to_int, NUMPY_T  # noqa: E402
+from data_utils import from_buffer, ctype_from_precision_t  # noqa: E402


 ERR_THRESHOLD = 0.001
@@ -33,17 +33,15 @@ def main():
         elf = Elf(args.symbols_bin)
     else:
         elf = Elf(args.snitch_bin)
-    dtype_size = bytes_to_int(elf.get_symbol_contents('dtype_size'),
-                              prec='32', signedness='unsigned')[0]
-    prec = str(dtype_size*8)
-    a = np.array(bytes_to_float(elf.get_symbol_contents('a'), prec=prec))
-    b = np.array(bytes_to_float(elf.get_symbol_contents('b'), prec=prec))
-    c = np.array(bytes_to_float(elf.get_symbol_contents('c'), prec=prec))
-    beta = bytes_to_int(elf.get_symbol_contents('BETA'), prec='32', signedness='unsigned')[0]
-    m = bytes_to_int(elf.get_symbol_contents('M'), prec='32', signedness='unsigned')[0]
-    n = bytes_to_int(elf.get_symbol_contents('N'), prec='32', signedness='unsigned')[0]
-    k = bytes_to_int(elf.get_symbol_contents('K'), prec='32', signedness='unsigned')[0]
-    tb = bytes_to_int(elf.get_symbol_contents('TB'), prec='32', signedness='unsigned')[0]
+    prec = elf.from_symbol('dtype_size', 'uint32_t')[0]
+    a = elf.from_symbol('a', ctype_from_precision_t(prec))
+    b = elf.from_symbol('b', ctype_from_precision_t(prec))
+    c = elf.from_symbol('c', ctype_from_precision_t(prec))
+    beta = elf.from_symbol('BETA', 'uint32_t')[0]
+    m = elf.from_symbol('M', 'uint32_t')[0]
+    n = elf.from_symbol('N', 'uint32_t')[0]
+    k = elf.from_symbol('K', 'uint32_t')[0]
+    tb = elf.from_symbol('TB', 'uint32_t')[0]
     a = np.reshape(a, (m, k))
     if tb:
         b = np.reshape(b, (n, k))
@@ -53,7 +51,7 @@ def main():
     c = np.reshape(c, (m, n))

     # Verify results
-    c_actual = np.array(bytes_to_float(raw_results['c'], prec), dtype=NUMPY_T[prec])
+    c_actual = from_buffer(raw_results['c'], ctype_from_precision_t(prec))
     c_golden = golden_model(1, a, b, beta, c).flatten()

     absolute_err = np.absolute(c_golden - c_actual)
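Note that `dtype_size` is now consumed directly as the precision: `ctype_from_precision_t` evidently maps a precision given in bytes to the C type string that `from_buffer` and `Elf.from_symbol` understand. A plausible sketch under that assumption, not the actual `data_utils` code:

```python
# Hypothetical mapping from a precision in bytes to a C type name; the
# actual helper in util/sim/data_utils.py may differ (e.g. FP8 handling).
def ctype_from_precision_t(prec):
    ctypes = {
        8: 'double',
        4: 'float',
        2: '__fp16',
    }
    return ctypes[int(prec)]
```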
4 changes: 2 additions & 2 deletions sw/dnn/README.md

@@ -19,10 +19,10 @@ There are currently a few tests for various layer types. Some additional informa
 - `net-batchnorm.c`: Implementation of a batchnorm layer with SSR streams (both read and write)
 - `net-conv2d.c`: Implementation and tiling of a 2D convolution that can be distributed to multiple clusters. The convolution is implemented as an `im2col` transformation (performed by 2D DMA transfers) + optimized GEMM. The memory layout of input and output feature map is Height x Width x Channels. The convolution is globally parallelized over output channels. Inside a cluster, the output pixels are distributed among the cores. There is an option to load the feature map from a different cluster instead of the main memory by setting `cluster2cluster` in the layer struct to `1`. Currently only `fp64` is implemented, but the data movement for `fp32` or lower precision SIMD should be analogously.
 - `net-gemm.c`: Testbench to benchmark the optimized GEMM implementation for different memory layouts, dimensions and precisions.
-- `net-fusedconv.c`: Implementation of a fused kernel with Conv2d + BatchNorm + ReLU. The interface of the kernel is compatible with DORY. Parameters of a tile can be specified in `data/fusedconv_param.hjson`. Supported paramters are input/output dimension, padding, kernel dimension & stride, flags for BatchNorm and ReLU. Further there are two additional specialized kernels 1) a CHW kernel for input layers with very few input channels, the output of this kernel is in the HWC layout again 2) A depthwise kernel
+- `net-fusedconv.c`: Implementation of a fused kernel with Conv2d + BatchNorm + ReLU. The interface of the kernel is compatible with DORY. Parameters of a tile can be specified in `data/fusedconv_param.json`. Supported paramters are input/output dimension, padding, kernel dimension & stride, flags for BatchNorm and ReLU. Further there are two additional specialized kernels 1) a CHW kernel for input layers with very few input channels, the output of this kernel is in the HWC layout again 2) A depthwise kernel

 ## Usage
-To run a specific benchmark, first configure the dimensions and the desired precision `data/app_params.hjson`.
+To run a specific benchmark, first configure the dimensions and the desired precision `data/app_params.json`.
 ```
 {
     kernel: "GEMM"
15 changes: 4 additions & 11 deletions sw/dnn/batchnorm/data/datagen.py

@@ -9,7 +9,7 @@

 import argparse
 import pathlib
-import hjson
+import json5
 import sys
 import os
 import torch
@@ -26,13 +26,6 @@
 # the occurrence of these splits the data should be aligned to 4KB
 BURST_ALIGNMENT = 4096

-PRECISION_T = {
-    '64': 'FP64',
-    '32': 'FP32',
-    '16': 'FP16',
-    '8': 'FP8'
-}
-

 def golden_model(ifmap):
     n, ci, ih, iw = ifmap.shape
@@ -55,8 +48,8 @@ def emit_header(**kwargs):
     tile_ci = kwargs['tile_ci']
     prec = str(kwargs['prec'])

-    torch_type = data_utils.floating_point_torch_type(prec)
-    ctype = data_utils.floating_point_ctype(prec)
+    torch_type = data_utils.torch_type_from_precision_t(prec)
+    ctype = data_utils.ctype_from_precision_t(prec)

     ifmap = torch.randn(1, in_channels, in_height, in_width, requires_grad=False, dtype=torch_type)
     ofmap, gamma, beta = golden_model(ifmap)
@@ -124,7 +117,7 @@ def main():

     # Load param config file
     with args.cfg.open() as f:
-        param = hjson.loads(f.read())
+        param = json5.loads(f.read())
     param['section'] = args.section

     # Emit header file
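With the per-script `PRECISION_T`/`PRECISION` tables deleted, the precision-to-type mapping is centralized in `data_utils`. Judging from the call sites and the new `prec: "FP64"` config values, `torch_type_from_precision_t` resolves a precision identifier to a `torch.dtype`; a minimal sketch, where the set of accepted identifiers is an assumption:

```python
import torch

# Hypothetical stand-in for the shared helper in util/sim/data_utils.py;
# FP8 is omitted here because PyTorch support for it varies by version.
def torch_type_from_precision_t(prec):
    torch_types = {
        'FP64': torch.float64,
        'FP32': torch.float32,
        'FP16': torch.float16,
    }
    return torch_types[str(prec)]
```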
8 changes: 4 additions & 4 deletions sw/dnn/batchnorm/data/params.hjson → sw/dnn/batchnorm/data/params.json

@@ -4,10 +4,10 @@

 {
     input_dim: {
-        channels: 32
+        channels: 32,
         height: 8,
         width: 8
-    }
-    tile_ci: 32
-    prec: 64
+    },
+    tile_ci: 32,
+    prec: "FP64"
 }
2 changes: 1 addition & 1 deletion sw/dnn/common.mk

@@ -11,7 +11,7 @@ DATA_DIR := $(realpath $(MK_DIR)/$(APP)/data)
 SRC_DIR := $(realpath $(MK_DIR)/$(APP)/src)
 COMMON_SRC_DIR := $(realpath $(MK_DIR)/src)

-DATA_CFG ?= $(DATA_DIR)/params.hjson
+DATA_CFG ?= $(DATA_DIR)/params.json
 SECTION ?=

 SRCS ?= $(realpath $(SRC_DIR)/main.c)
17 changes: 5 additions & 12 deletions sw/dnn/concat/data/datagen.py

@@ -8,7 +8,7 @@
 import argparse
 import numpy as np
 import pathlib
-import hjson
+import json5
 import sys
 import os
 import torch
@@ -25,13 +25,6 @@
 # the occurrence of these splits the data should be aligned to 4KB
 BURST_ALIGNMENT = 4096

-PRECISION = {
-    'FP64': '64',
-    'FP32': '32',
-    'FP16': '16',
-    'FP8': '8'
-}
-

 def golden_model(inputs):
     innermost_dim = len(inputs[0].shape) - 1
@@ -41,15 +34,15 @@ def emit_header(section, params):
 def emit_header(section, params):
     num_inputs = params['num_inputs']
     input_shape = params['input_shape']
-    prec = PRECISION[params['dtype']]
+    prec = params['dtype']

-    torch_type = data_utils.floating_point_torch_type(prec)
+    torch_type = data_utils.torch_type_from_precision_t(prec)

     inputs = [torch.rand(*input_shape, requires_grad=False, dtype=torch_type)
               for _ in range(num_inputs)]
     output = golden_model(inputs)

-    ctype = data_utils.floating_point_ctype(prec)
+    ctype = data_utils.ctype_from_precision_t(prec)

     layer_cfg = {
         **params,
@@ -94,7 +87,7 @@ def main():

     # Load param config file
     with args.cfg.open() as f:
-        param = hjson.loads(f.read())
+        param = json5.loads(f.read())

     # Emit header file
     with open(args.output, 'w') as f:
6 changes: 3 additions & 3 deletions sw/dnn/concat/data/params.hjson → sw/dnn/concat/data/params.json

@@ -3,7 +3,7 @@
 // SPDX-License-Identifier: SHL-0.51

 {
-    num_inputs: 1
-    input_shape: [32, 4]
-    dtype: FP64
+    num_inputs: 1,
+    input_shape: [32, 4],
+    dtype: "FP64"
 }
… (remaining changed files not loaded)