Commit 33ee07f: Implement revisions

colluca committed Feb 8, 2024 · 1 parent 546df1a
Showing 61 changed files with 354 additions and 1,478 deletions.
2 changes: 1 addition & 1 deletion .clang-format

@@ -5,4 +5,4 @@
 # The CI runs on `clang-format` version 10
 BasedOnStyle: Google
 IndentWidth: 4
-IncludeBlocks: Preserve
\ No newline at end of file
+IncludeBlocks: Preserve
2 changes: 1 addition & 1 deletion .clang-format-ignore

@@ -3,4 +3,4 @@
 # SPDX-License-Identifier: Apache-2.0

 # Ignore vendored third-party code
-./sw/math/*
\ No newline at end of file
+./sw/math/*
1 change: 1 addition & 0 deletions docs/rm/sim/data_utils.md

@@ -0,0 +1 @@
+::: data_utils
1 change: 1 addition & 0 deletions mkdocs.yml

@@ -54,6 +54,7 @@ nav:
       # - Solder: rm/solder.md
       - Software:
         - Simulation Utilities:
+          - data_utils: rm/sim/data_utils.md
           - sim_utils: rm/sim/sim_utils.md
           - rm/sim/Simulation.md
           - rm/sim/Simulator.md
10 changes: 5 additions & 5 deletions sw/blas/axpy/verify.py

@@ -13,7 +13,7 @@
 sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
 import verification  # noqa: E402
 from elf import Elf  # noqa: E402
-from data_utils import bytes_to_float  # noqa: E402
+from data_utils import from_buffer  # noqa: E402


 ERR_THRESHOLD = 1E-10
@@ -27,16 +27,16 @@ def main():
                                      symbols_bin=args.symbols_bin,
                                      log=args.log,
                                      output_uids=['z'])
-    z_actual = np.array(bytes_to_float(raw_results['z'], prec='64'))
+    z_actual = from_buffer(raw_results['z'], 'double')

     # Extract input operands from ELF file
     if args.symbols_bin:
        elf = Elf(args.symbols_bin)
    else:
        elf = Elf(args.snitch_bin)
-    a = np.array(bytes_to_float(elf.get_symbol_contents('a'), prec='64'))
-    x = np.array(bytes_to_float(elf.get_symbol_contents('x'), prec='64'))
-    y = np.array(bytes_to_float(elf.get_symbol_contents('y'), prec='64'))
+    a = elf.from_symbol('a', 'double')
+    x = elf.from_symbol('x', 'double')
+    y = elf.from_symbol('y', 'double')

     # Verify results
     z_golden = golden_model(a, x, y)
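The pattern above repeats throughout this commit: `bytes_to_float(elf.get_symbol_contents(name), prec=...)` collapses into `elf.from_symbol(name, ctype)`, and raw output buffers go through `from_buffer(buf, ctype)` keyed by a C type name instead of a bit-width string. The sketch below shows one way such a helper could look; it is inferred purely from the call sites in this diff (the real code lives in `util/sim/data_utils.py` and `util/sim/elf.py`), so the dictionary contents and dtype coverage are assumptions.

```python
import numpy as np

# Assumed mapping from C type names to NumPy dtypes; the real helper may
# cover more types (e.g. FP8) and perform extra validation.
_DTYPE_FROM_CTYPE = {
    'double': np.float64,
    'float': np.float32,
    '__fp16': np.float16,
    'uint32_t': np.uint32,
    'int32_t': np.int32,
}


def from_buffer(buf, ctype='uint32_t'):
    """Interpret a raw byte buffer as a NumPy array of the given C type."""
    return np.frombuffer(buf, dtype=_DTYPE_FROM_CTYPE[ctype])
```

`Elf.from_symbol(name, ctype)` then presumably just composes the existing `get_symbol_contents(name)` with `from_buffer`.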
2 changes: 1 addition & 1 deletion sw/blas/gemm/Makefile

@@ -9,7 +9,7 @@ MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
 DATA_DIR := $(realpath $(MK_DIR)/data)
 SRC_DIR := $(realpath $(MK_DIR)/src)

-DATA_CFG ?= $(DATA_DIR)/params.hjson
+DATA_CFG ?= $(DATA_DIR)/params.json
 SECTION ?=

 APP ?= gemm
6 changes: 3 additions & 3 deletions sw/blas/gemm/data/datagen.py

@@ -9,7 +9,7 @@
 import numpy as np
 import argparse
 import pathlib
-import hjson
+import json5
 import sys
 import os

@@ -121,7 +121,7 @@ def emit_header(**kwargs):
     data_str += [format_scalar_definition('uint32_t', 'k_tiles', kwargs['k_tiles'])]
     data_str += [format_scalar_definition('uint32_t', 'parallelize_m', kwargs['parallelize_m'])]
     data_str += [format_scalar_definition('uint32_t', 'parallelize_k', kwargs['parallelize_k'])]
-    data_str += [format_scalar_definition('uint32_t', 'baseline', baseline)]
+    data_str += [format_scalar_definition('uint32_t', 'baseline', int(baseline))]
     data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
                                          alignment=BURST_ALIGNMENT, section=kwargs['section'])]
     data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
@@ -157,7 +157,7 @@ def main():

     # Load param config file
     with args.cfg.open() as f:
-        param = hjson.loads(f.read())
+        param = json5.loads(f.read())
     param['section'] = args.section

     # Emit header file
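Swapping `hjson` for `json5` is more than a rename: JSON5 keeps comments and unquoted keys but, unlike HJSON, requires commas between members, which is why the params files in this commit gain commas and quoted string values. It also parses `true`/`false` into Python booleans, hence the new `int(baseline)` cast before the value is emitted as a `uint32_t` scalar. A quick illustration, assuming the `json5` package from PyPI:

```python
import json5

# JSON5 accepts comments, unquoted keys and trailing commas, but member
# separators are mandatory (HJSON also allowed newline-separated members).
cfg = json5.loads("""
{
    M: 16,            // comment, unquoted key
    prec: "FP64",     // values follow regular JSON string rules
    baseline: true,   // parsed as a Python bool
}
""")
assert cfg['M'] == 16 and cfg['prec'] == 'FP64'
assert int(cfg['baseline']) == 1  # booleans need casting before C codegen
```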
14 changes: 7 additions & 7 deletions sw/blas/gemm/data/params.hjson → sw/blas/gemm/data/params.json

@@ -10,13 +10,13 @@
     K: 4,
     beta: 0,
     ta: false,
-    tb: true, // must be true for SIMD
+    tb: false, // must be true for SIMD
     prec: 64,
     expand: 0,
-    m_tiles: 2 // number of tiles in M dimension
-    k_tiles: 2 // number of tiles in K dimension
-    n_tiles: 2 // number of tiles in N dimension
-    parallelize_k: 0
-    parallelize_m: 1
-    baseline: 0
+    m_tiles: 2, // number of tiles in M dimension
+    k_tiles: 2, // number of tiles in K dimension
+    n_tiles: 2, // number of tiles in N dimension
+    parallelize_k: 0,
+    parallelize_m: 1,
+    baseline: true
}
35 changes: 17 additions & 18 deletions sw/blas/gemm/src/gemm.h

@@ -236,8 +236,7 @@ void gemm_fp64_opt(uint32_t M, uint32_t N, uint32_t K, double* A, uint32_t ldA,
     // Unrolling factor of most inner loop.
     // Should be at least as high as the FMA delay
    // for maximum utilization
-    // const uint32_t unroll = 8;
-    const uint32_t unroll = 4;
+    const uint32_t unroll = 8;

     // A is of size MxK, B is of size KxN, C is of size MxN
     // for (uint32_t m = 0; m < M; m++) {
@@ -307,30 +306,30 @@
                c[1] = C[m * ldC + n + 1];
                c[2] = C[m * ldC + n + 2];
                c[3] = C[m * ldC + n + 3];
-                // c[4] = C[m * ldC + n + 4];
-                // c[5] = C[m * ldC + n + 5];
-                // c[6] = C[m * ldC + n + 6];
-                // c[7] = C[m * ldC + n + 7];
+                c[4] = C[m * ldC + n + 4];
+                c[5] = C[m * ldC + n + 5];
+                c[6] = C[m * ldC + n + 6];
+                c[7] = C[m * ldC + n + 7];
            } else {
                c[0] = 0.0;
                c[1] = 0.0;
                c[2] = 0.0;
                c[3] = 0.0;
-                // c[4] = 0.0;
-                // c[5] = 0.0;
-                // c[6] = 0.0;
-                // c[7] = 0.0;
+                c[4] = 0.0;
+                c[5] = 0.0;
+                c[6] = 0.0;
+                c[7] = 0.0;
            }
            asm volatile(
                "frep.o %[n_frep], %[unroll], 0, 0 \n"
                "fmadd.d %[c0], ft0, ft1, %[c0] \n"
                "fmadd.d %[c1], ft0, ft1, %[c1] \n"
                "fmadd.d %[c2], ft0, ft1, %[c2] \n"
                "fmadd.d %[c3], ft0, ft1, %[c3] \n"
-                // "fmadd.d %[c4], ft0, ft1, %[c4] \n"
-                // "fmadd.d %[c5], ft0, ft1, %[c5] \n"
-                // "fmadd.d %[c6], ft0, ft1, %[c6] \n"
-                // "fmadd.d %[c7], ft0, ft1, %[c7] \n"
+                "fmadd.d %[c4], ft0, ft1, %[c4] \n"
+                "fmadd.d %[c5], ft0, ft1, %[c5] \n"
+                "fmadd.d %[c6], ft0, ft1, %[c6] \n"
+                "fmadd.d %[c7], ft0, ft1, %[c7] \n"
                : [ c0 ] "+f"(c[0]), [ c1 ] "+f"(c[1]), [ c2 ] "+f"(c[2]),
                  [ c3 ] "+f"(c[3]), [ c4 ] "+f"(c[4]), [ c5 ] "+f"(c[5]),
                  [ c6 ] "+f"(c[6]), [ c7 ] "+f"(c[7])
@@ -342,10 +341,10 @@ void gemm_fp64_opt(uint32_t M, uint32_t N, uint32_t K, double* A, uint32_t ldA,
            C[m * ldC + n + 1] = c[1];
            C[m * ldC + n + 2] = c[2];
            C[m * ldC + n + 3] = c[3];
-            // C[m * ldC + n + 4] = c[4];
-            // C[m * ldC + n + 5] = c[5];
-            // C[m * ldC + n + 6] = c[6];
-            // C[m * ldC + n + 7] = c[7];
+            C[m * ldC + n + 4] = c[4];
+            C[m * ldC + n + 5] = c[5];
+            C[m * ldC + n + 6] = c[6];
+            C[m * ldC + n + 7] = c[7];
            n += unroll;
        }
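Re-enabling the four extra accumulators restores `unroll = 8`. The unroll factor matters because every `fmadd.d` in the FREP body accumulates into its own register: an accumulator is reused only once every `unroll` instructions, so the issue stream never stalls as long as `unroll` is at least the FMA latency, exactly as the comment in the kernel says. A toy model of that trade-off (the latency value here is an illustrative assumption, not a figure from the Snitch FPU documentation):

```python
# Toy utilization model for a pipelined FMA unit fed by `unroll`
# independent accumulator chains.
def fpu_utilization(unroll, fma_latency=8):  # latency assumed, for illustration
    # c[i] is written again `unroll` instructions after its previous write,
    # so issue stalls whenever unroll < fma_latency.
    return min(1.0, unroll / fma_latency)


print(fpu_utilization(4))  # 0.5 -> the old unroll = 4 leaves pipeline bubbles
print(fpu_utilization(8))  # 1.0 -> unroll = 8 keeps the FPU busy
```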
24 changes: 11 additions & 13 deletions sw/blas/gemm/verify.py

@@ -13,7 +13,7 @@
 sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
 import verification  # noqa: E402
 from elf import Elf  # noqa: E402
-from data_utils import bytes_to_float, bytes_to_int, NUMPY_T  # noqa: E402
+from data_utils import from_buffer, ctype_from_precision_t  # noqa: E402


 ERR_THRESHOLD = 0.001
@@ -33,17 +33,15 @@ def main():
         elf = Elf(args.symbols_bin)
     else:
         elf = Elf(args.snitch_bin)
-    dtype_size = bytes_to_int(elf.get_symbol_contents('dtype_size'),
-                              prec='32', signedness='unsigned')[0]
-    prec = str(dtype_size*8)
-    a = np.array(bytes_to_float(elf.get_symbol_contents('a'), prec=prec))
-    b = np.array(bytes_to_float(elf.get_symbol_contents('b'), prec=prec))
-    c = np.array(bytes_to_float(elf.get_symbol_contents('c'), prec=prec))
-    beta = bytes_to_int(elf.get_symbol_contents('BETA'), prec='32', signedness='unsigned')[0]
-    m = bytes_to_int(elf.get_symbol_contents('M'), prec='32', signedness='unsigned')[0]
-    n = bytes_to_int(elf.get_symbol_contents('N'), prec='32', signedness='unsigned')[0]
-    k = bytes_to_int(elf.get_symbol_contents('K'), prec='32', signedness='unsigned')[0]
-    tb = bytes_to_int(elf.get_symbol_contents('TB'), prec='32', signedness='unsigned')[0]
+    prec = elf.from_symbol('dtype_size', 'uint32_t')[0]
+    a = elf.from_symbol('a', ctype_from_precision_t(prec))
+    b = elf.from_symbol('b', ctype_from_precision_t(prec))
+    c = elf.from_symbol('c', ctype_from_precision_t(prec))
+    beta = elf.from_symbol('BETA', 'uint32_t')[0]
+    m = elf.from_symbol('M', 'uint32_t')[0]
+    n = elf.from_symbol('N', 'uint32_t')[0]
+    k = elf.from_symbol('K', 'uint32_t')[0]
+    tb = elf.from_symbol('TB', 'uint32_t')[0]
     a = np.reshape(a, (m, k))
     if tb:
         b = np.reshape(b, (n, k))
@@ -53,7 +51,7 @@ def main():
     c = np.reshape(c, (m, n))

     # Verify results
-    c_actual = np.array(bytes_to_float(raw_results['c'], prec), dtype=NUMPY_T[prec])
+    c_actual = from_buffer(raw_results['c'], ctype_from_precision_t(prec))
     c_golden = golden_model(1, a, b, beta, c).flatten()

     absolute_err = np.absolute(c_golden - c_actual)
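Note that `dtype_size` is now consumed directly as the precision: `ctype_from_precision_t` evidently maps a precision given in bytes to the C type string that `from_buffer` and `Elf.from_symbol` understand. A plausible sketch under that assumption, not the actual `data_utils` code:

```python
# Hypothetical mapping from a precision in bytes to a C type name; the
# actual helper in util/sim/data_utils.py may differ (e.g. FP8 handling).
def ctype_from_precision_t(prec):
    ctypes = {
        8: 'double',
        4: 'float',
        2: '__fp16',
    }
    return ctypes[int(prec)]
```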
4 changes: 2 additions & 2 deletions sw/dnn/README.md

@@ -19,10 +19,10 @@ There are currently a few tests for various layer types. Some additional informa
 - `net-batchnorm.c`: Implementation of a batchnorm layer with SSR streams (both read and write)
 - `net-conv2d.c`: Implementation and tiling of a 2D convolution that can be distributed to multiple clusters. The convolution is implemented as an `im2col` transformation (performed by 2D DMA transfers) + optimized GEMM. The memory layout of input and output feature map is Height x Width x Channels. The convolution is globally parallelized over output channels. Inside a cluster, the output pixels are distributed among the cores. There is an option to load the feature map from a different cluster instead of the main memory by setting `cluster2cluster` in the layer struct to `1`. Currently only `fp64` is implemented, but the data movement for `fp32` or lower precision SIMD should be analogously.
 - `net-gemm.c`: Testbench to benchmark the optimized GEMM implementation for different memory layouts, dimensions and precisions.
-- `net-fusedconv.c`: Implementation of a fused kernel with Conv2d + BatchNorm + ReLU. The interface of the kernel is compatible with DORY. Parameters of a tile can be specified in `data/fusedconv_param.hjson`. Supported paramters are input/output dimension, padding, kernel dimension & stride, flags for BatchNorm and ReLU. Further there are two additional specialized kernels 1) a CHW kernel for input layers with very few input channels, the output of this kernel is in the HWC layout again 2) A depthwise kernel
+- `net-fusedconv.c`: Implementation of a fused kernel with Conv2d + BatchNorm + ReLU. The interface of the kernel is compatible with DORY. Parameters of a tile can be specified in `data/fusedconv_param.json`. Supported paramters are input/output dimension, padding, kernel dimension & stride, flags for BatchNorm and ReLU. Further there are two additional specialized kernels 1) a CHW kernel for input layers with very few input channels, the output of this kernel is in the HWC layout again 2) A depthwise kernel

 ## Usage
-To run a specific benchmark, first configure the dimensions and the desired precision `data/app_params.hjson`.
+To run a specific benchmark, first configure the dimensions and the desired precision `data/app_params.json`.
 ```
 {
     kernel: "GEMM"
15 changes: 4 additions & 11 deletions sw/dnn/batchnorm/data/datagen.py

@@ -9,7 +9,7 @@

 import argparse
 import pathlib
-import hjson
+import json5
 import sys
 import os
 import torch
@@ -26,13 +26,6 @@
 # the occurrence of these splits the data should be aligned to 4KB
 BURST_ALIGNMENT = 4096

-PRECISION_T = {
-    '64': 'FP64',
-    '32': 'FP32',
-    '16': 'FP16',
-    '8': 'FP8'
-}
-

 def golden_model(ifmap):
     n, ci, ih, iw = ifmap.shape
@@ -55,8 +48,8 @@ def emit_header(**kwargs):
     tile_ci = kwargs['tile_ci']
     prec = str(kwargs['prec'])

-    torch_type = data_utils.floating_point_torch_type(prec)
-    ctype = data_utils.floating_point_ctype(prec)
+    torch_type = data_utils.torch_type_from_precision_t(prec)
+    ctype = data_utils.ctype_from_precision_t(prec)

     ifmap = torch.randn(1, in_channels, in_height, in_width, requires_grad=False, dtype=torch_type)
     ofmap, gamma, beta = golden_model(ifmap)
@@ -124,7 +117,7 @@ def main():

     # Load param config file
     with args.cfg.open() as f:
-        param = hjson.loads(f.read())
+        param = json5.loads(f.read())
     param['section'] = args.section

     # Emit header file
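With the per-script `PRECISION_T`/`PRECISION` tables deleted, the precision-to-type mapping is centralized in `data_utils`. Judging from the call sites and the new `prec: "FP64"` config values, `torch_type_from_precision_t` resolves a precision identifier to a `torch.dtype`; a minimal sketch, where the set of accepted identifiers is an assumption:

```python
import torch

# Hypothetical stand-in for the shared helper in util/sim/data_utils.py;
# FP8 is omitted here because PyTorch support for it varies by version.
def torch_type_from_precision_t(prec):
    torch_types = {
        'FP64': torch.float64,
        'FP32': torch.float32,
        'FP16': torch.float16,
    }
    return torch_types[str(prec)]
```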
8 changes: 4 additions & 4 deletions sw/dnn/batchnorm/data/params.hjson → sw/dnn/batchnorm/data/params.json

@@ -4,10 +4,10 @@

 {
     input_dim: {
-        channels: 32
+        channels: 32,
         height: 8,
         width: 8
-    }
-    tile_ci: 32
-    prec: 64
+    },
+    tile_ci: 32,
+    prec: "FP64"
 }
2 changes: 1 addition & 1 deletion sw/dnn/common.mk

@@ -11,7 +11,7 @@ DATA_DIR := $(realpath $(MK_DIR)/$(APP)/data)
 SRC_DIR := $(realpath $(MK_DIR)/$(APP)/src)
 COMMON_SRC_DIR := $(realpath $(MK_DIR)/src)

-DATA_CFG ?= $(DATA_DIR)/params.hjson
+DATA_CFG ?= $(DATA_DIR)/params.json
 SECTION ?=

 SRCS ?= $(realpath $(SRC_DIR)/main.c)
17 changes: 5 additions & 12 deletions sw/dnn/concat/data/datagen.py

@@ -8,7 +8,7 @@
 import argparse
 import numpy as np
 import pathlib
-import hjson
+import json5
 import sys
 import os
 import torch
@@ -25,13 +25,6 @@
 # the occurrence of these splits the data should be aligned to 4KB
 BURST_ALIGNMENT = 4096

-PRECISION = {
-    'FP64': '64',
-    'FP32': '32',
-    'FP16': '16',
-    'FP8': '8'
-}
-

 def golden_model(inputs):
     innermost_dim = len(inputs[0].shape) - 1
@@ -41,15 +34,15 @@ def emit_header(section, params):
 def emit_header(section, params):
     num_inputs = params['num_inputs']
     input_shape = params['input_shape']
-    prec = PRECISION[params['dtype']]
+    prec = params['dtype']

-    torch_type = data_utils.floating_point_torch_type(prec)
+    torch_type = data_utils.torch_type_from_precision_t(prec)

     inputs = [torch.rand(*input_shape, requires_grad=False, dtype=torch_type)
               for _ in range(num_inputs)]
     output = golden_model(inputs)

-    ctype = data_utils.floating_point_ctype(prec)
+    ctype = data_utils.ctype_from_precision_t(prec)

     layer_cfg = {
         **params,
@@ -94,7 +87,7 @@ def main():

     # Load param config file
     with args.cfg.open() as f:
-        param = hjson.loads(f.read())
+        param = json5.loads(f.read())

     # Emit header file
     with open(args.output, 'w') as f:
6 changes: 3 additions & 3 deletions sw/dnn/concat/data/params.hjson → sw/dnn/concat/data/params.json

@@ -3,7 +3,7 @@
 // SPDX-License-Identifier: SHL-0.51

 {
-    num_inputs: 1
-    input_shape: [32, 4]
-    dtype: FP64
+    num_inputs: 1,
+    input_shape: [32, 4],
+    dtype: "FP64"
 }
… (remaining changed files not loaded)