Add tiled simple mult example (pulp-platform#33)

* sw: Copy simple mac example and refactor * sw: Refactor accelerator setup * sw: Add function for launching accelerator * sw: Refactor correctness check * sw: Working multiple SNAX runs * sw: Added duplicate runs * sw: Make duplicate runs in for loop * sw: Fix lint for code * sw: Move snax_mac_sw_clear to sw_barrier * sw: Tile execution of MAC operation * sw: Add data in to tiled implementation * sw: Add output transfer to double buffering loop * sw: Add license headers * sw: Fix linting errors * sw: Add tiled mac to makefile and runfile * sw: Add datagen * sw: Fix lint * sw: Resolve minor comments * sw: Add datagen.py to Makefile * sw: Fix lint * sw: Add performance coutners outside tiling loop * sw: Fix lint --------- Co-authored-by: rgantonio <[email protected]>
JosseVanDelm · Dec 11, 2023 · a4ab575 · a4ab575
1 parent f4d123f
commit a4ab575
Show file tree

Hide file tree

Showing 6 changed files with 255 additions and 0 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -90,6 +90,7 @@ jobs:
             hw/snax_gemm/src/*
             target/snitch_cluster/sw/apps/snax-mac/*
             target/snitch_cluster/sw/apps/snax-mac-simple/*
+            target/snitch_cluster/sw/apps/snax-mac-tiled/*
             target/snitch_cluster/sw/apps/snax-gemm-base/*
             target/snitch_cluster/sw/apps/snax-gemm-engine/*
 

diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile
@@ -20,6 +20,7 @@ SUBDIRS += dnn/softmax
 SUBDIRS += montecarlo/pi_estimation
 SUBDIRS += snax-mac
 SUBDIRS += snax-mac-simple
+SUBDIRS += snax-mac-tiled
 SUBDIRS += snax-gemm-engine
 SUBDIRS += snax-gemm-base
 

diff --git a/target/snitch_cluster/sw/apps/snax-mac-run.yaml b/target/snitch_cluster/sw/apps/snax-mac-run.yaml
@@ -12,6 +12,7 @@ runs:
   - app: dnn/gemm
   - app: snax-mac
   - app: snax-mac-simple
+  - app: snax-mac-tiled
 # dnn/gelu # seems like it stalls
 # dnn/conv2d # fails with exit code 32
 # dnn/fusedconv # fails newly

diff --git a/target/snitch_cluster/sw/apps/snax-mac-tiled/Makefile b/target/snitch_cluster/sw/apps/snax-mac-tiled/Makefile
@@ -0,0 +1,35 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Ryan Antonio <[email protected]>
+# Josse Van Delm <[email protected]>
+
+PYTHON = python3
+MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+DATA_DIR := $(realpath $(MK_DIR)/data)
+SRC_DIR  := $(realpath $(MK_DIR)/src)
+DATAGEN_PY = $(DATA_DIR)/datagen.py
+DATA_H     = $(DATA_DIR)/data.h
+
+# Total vector length and per-tile length passed to datagen.py.
+# LENGTH has to be an integer multiple of TILE_SIZE (datagen.py checks this).
+LENGTH ?= 512
+TILE_SIZE ?= 32
+
+APP     ?= snax-mac-tiled
+SRCS    ?= $(SRC_DIR)/snax-mac-tiled.c
+INCDIRS += $(DATA_DIR) $(SRC_DIR)
+
+# The data header is produced by redirecting stdout into the target. If the
+# generator fails, delete the partially written file so that a later make run
+# does not mistake it for an up-to-date header.
+.DELETE_ON_ERROR:
+
+# Run the generator through $(PYTHON) so the rule does not depend on
+# datagen.py having its executable bit set.
+$(DATA_H): $(DATAGEN_PY)
+	$(PYTHON) $< --length=$(LENGTH) --tile_size=$(TILE_SIZE) > $@
+
+.PHONY: clean-data clean
+
+clean-data:
+	rm -f $(DATA_H)
+
+clean: clean-data
+
+include ../common.mk
+
+# The generated header must exist before dependency files are computed
+$(DEP): $(DATA_H)
+
diff --git a/target/snitch_cluster/sw/apps/snax-mac-tiled/data/datagen.py b/target/snitch_cluster/sw/apps/snax-mac-tiled/data/datagen.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+import sys
+import argparse
+import numpy as np
+import os
+
+# Add data utility path
+sys.path.append(os.path.join(os.path.dirname(__file__),
+                "../../../../../../util/sim/"))
+from data_utils import format_scalar_definition, \
+                       format_vector_definition  # noqa: E402
+
+# Hard parameters: bounds of the randomly generated input values.
+# They are passed to np.random.randint as (low, high), so MIN is inclusive
+# and MAX is exclusive.
+MIN = 0
+MAX = 100
+
+
+def golden_model(a, b):
+    return a*b
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--length',
+        type=int,
+        help='Vector length of the input and output vectors')
+    parser.add_argument(
+        '--tile_size',
+        type=int,
+        help='Tile size length. vector_length/tile_size has to be an integer number!')
+    args = parser.parse_args()
+    tile_size = args.tile_size
+    length = args.length
+    assert length % tile_size == 0, "vector_length/tile_size has to be an integer number!"
+
+    # Randomly generate inputs
+    a = np.random.randint(MIN, MAX, length)
+    b = np.random.randint(MIN, MAX, length)
+    out = golden_model(a, b)
+    out_test = np.zeros(length, dtype=int)
+
+    # Format header file
+    l_str = format_scalar_definition('uint32_t', 'VEC_LEN', length)
+    t_str = format_scalar_definition('uint32_t', 'TILE_SIZE', tile_size)
+    a_str = format_vector_definition('uint32_t', 'A', a)
+    b_str = format_vector_definition('uint32_t', 'B', b)
+    out_str = format_vector_definition('uint32_t', 'OUT', out)
+    out_test_str = format_vector_definition('uint32_t', 'OUT_TEST', out_test)
+    f_str = '\n\n'.join([l_str, t_str, a_str, b_str, out_str, out_test_str])
+    f_str += '\n'
+
+    # Write to stdout
+    print(f_str)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/target/snitch_cluster/sw/apps/snax-mac-tiled/src/snax-mac-tiled.c b/target/snitch_cluster/sw/apps/snax-mac-tiled/src/snax-mac-tiled.c
@@ -0,0 +1,159 @@
+// Copyright 2023 KU Leuven
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Josse Van Delm <[email protected]>
+// Ryan Antonio <[email protected]>
+
+#include "snrt.h"
+
+#include "data.h"
+
+// Operating modes of the MAC accelerator:
+// * mac_mode (0):
+//   multiply-accumulate over the elements, i.e. a dot product
+// * simple_mult_mode (1):
+//   plain elementwise multiplication
+enum mode { mac_mode = 0, simple_mult_mode = 1 };
+
+// Kick off one accelerator run by writing 0 to CSR 0x3c0 (the start CSR).
+// This function only issues the write; waiting for the run to finish is done
+// separately (see snax_mac_sw_barrier, which polls CSR 0x3c3).
+void snax_mac_launch() {
+    // Write start CSR to launch accelerator
+    write_csr(0x3c0, 0);
+}
+
+// Clear the HWPE MAC accelerator by writing 0 to CSR 0x3c5, then execute
+// three nop instructions.
+void snax_mac_sw_clear() {
+    // write 0x3c5 to clear HWPE accelerator
+    // Otherwise the accelerator goes into undefined behaviour:
+    // It might stall/continue indefinitely
+    write_csr(0x3c5, 0);
+    // NOTE(review): the nops presumably give the clear a few cycles to take
+    // effect before the next CSR access -- confirm against the hardware
+    asm volatile("nop\n");
+    asm volatile("nop\n");
+    asm volatile("nop\n");
+}
+
+// Block until the HWPE MAC accelerator has finished, then clear it.
+void snax_mac_sw_barrier() {
+    // Busy-wait for as long as CSR 0x3c3 reads back non-zero, i.e. until the
+    // accelerator is finished
+    while (read_csr(0x3c3) != 0) {
+    }
+    // The clear is required for the HWPE MAC accelerator to accept another
+    // run after this one
+    snax_mac_sw_clear();
+}
+
+/*
+ * Program the hwpe_mac accelerator for a run in simple_mult mode.
+ * In this mode the accelerator computes the product A*B in 32 bits and
+ * stores the result starting at o. This function only writes the
+ * configuration CSRs; it does not start the accelerator.
+ * args:
+ *  a: pointer in TCDM (L1) to vector A
+ *  b: pointer in TCDM (L1) to vector B
+ *  o: pointer in TCDM (L1) to where output O must be stored
+ *  vector_length: length of A, B and O
+ */
+void snax_mac_setup_simple_mult(uint32_t* a, uint32_t* b, uint32_t* o,
+                                uint32_t vector_length) {
+    // Operand and result base addresses as 32-bit CSR values
+    const uint32_t addr_a = (uint32_t)a;
+    const uint32_t addr_b = (uint32_t)b;
+    const uint32_t addr_o = (uint32_t)o;
+
+    // Address CSRs
+    write_csr(0x3d0, addr_a);
+    write_csr(0x3d1, addr_b);
+    write_csr(0x3d3, addr_o);
+
+    // Configuration CSRs
+    // Number of iterations
+    write_csr(0x3d4, 1);
+    // Vector length
+    write_csr(0x3d5, vector_length);
+    // Select simple (elementwise) multiplication
+    write_csr(0x3d6, simple_mult_mode);
+}
+
+// CPU reference for the accelerator's simple_mult mode:
+// writes o[k] = a[k] * b[k] (32-bit, wrapping) for the first vector_length
+// elements, in ascending index order.
+void cpu_simple_mult(uint32_t* a, uint32_t* b, uint32_t* o,
+                     uint32_t vector_length) {
+    for (uint32_t k = 0; k != vector_length; ++k) {
+        const uint32_t lhs = a[k];
+        const uint32_t rhs = b[k];
+        o[k] = lhs * rhs;
+    }
+}
+
+// Count the positions at which output differs from output_golden within the
+// first vector_length elements. Returns the number of mismatches, so 0 means
+// the two vectors are identical.
+int check_simple_mult(uint32_t* output, uint32_t* output_golden,
+                      uint32_t vector_length) {
+    uint32_t mismatches = 0;
+    uint32_t idx = 0;
+    while (idx < vector_length) {
+        // Each differing element contributes exactly one error
+        mismatches += (output[idx] != output_golden[idx]) ? 1u : 0u;
+        idx++;
+    }
+    return (int)mismatches;
+}
+
+// Tiled elementwise multiplication on the SNAX MAC accelerator.
+//
+// The data mover core streams tiles of A and B into TCDM and results back to
+// OUT_TEST, while the compute core runs the accelerator on the previously
+// loaded tile. Afterwards the result is compared against the golden output
+// OUT, and a CPU computation of the same product is checked as well.
+// Returns the number of mismatching elements (0 on success).
+int main() {
+    uint32_t *local_a, *local_b;
+    uint32_t* local_o;
+
+    // Place three consecutive VEC_LEN-element buffers (A, B, O) in TCDM,
+    // starting at the next free L1 address
+    local_a = (uint32_t*)snrt_l1_next();
+    local_b = local_a + VEC_LEN;
+    local_o = local_b + VEC_LEN;
+
+    uint32_t tile_size = TILE_SIZE;
+    // Warning: Manually make sure this is an integer number!
+    uint32_t iterations = VEC_LEN / tile_size;
+    size_t transfer_size = tile_size * sizeof(uint32_t);
+    // Main tiling loop
+    // I:
+    // | (0) | (1) | (2) | (3) | (4) | (5) |
+    // Phase:
+    // |in---| cal |--out|
+    //       |in---| cal |--out|
+    //             |in---| cal |--out|
+    //                   |in---| cal |--out|
+    //
+    // Note:"in" and "out" can technically not execute at the same time,
+    // so they are "waiting" for each other to release the DMA.
+    //
+    // Add + 2 to iterations for end of pipeline
+    // Cycle counter samples taken right before and after the tiling loop
+    uint32_t cycles_pre_loop = snrt_mcycle();
+    for (uint32_t i = 0; i < iterations + 2; i++) {
+        // Load in data: not in last two iterations
+        if (snrt_is_dm_core() && i < iterations) {
+            // Use data mover core to bring data from L3 to TCDM
+            snrt_dma_start_1d(local_a + i * tile_size, A + i * tile_size,
+                              transfer_size);
+            snrt_dma_start_1d(local_b + i * tile_size, B + i * tile_size,
+                              transfer_size);
+        }
+        // Calculate a tile: not in first iteration, not in last iteration
+        if (snrt_is_compute_core() && i > 0 && i < iterations + 1) {
+            snax_mac_setup_simple_mult(
+                local_a + (i - 1) * tile_size, local_b + (i - 1) * tile_size,
+                local_o + (i - 1) * tile_size, tile_size);
+            snax_mac_launch();
+            snax_mac_sw_barrier();
+        }
+        // Load out data: not in first two iterations
+        if (snrt_is_dm_core() && i > 1) {
+            // Use data mover core to bring data from TCDM to L3
+            snrt_dma_start_1d(OUT_TEST + (i - 2) * tile_size,
+                              local_o + (i - 2) * tile_size, transfer_size);
+        }
+        // snrt_dma_start_1d only starts a transfer. The data mover core has
+        // to wait for everything it started in this iteration before it
+        // enters the barrier; the barrier by itself only synchronizes the
+        // cores, so without this wait the next iteration could compute on a
+        // tile that has not fully arrived in TCDM yet.
+        if (snrt_is_dm_core()) {
+            snrt_dma_wait_all();
+        }
+        // Synchronize data mover and compute core between pipeline steps
+        snrt_cluster_hw_barrier();
+    }
+    uint32_t cycles_post_loop = snrt_mcycle();
+    // Move tiled output data from L3 back to TCDM to check for correctness
+    if (snrt_is_dm_core()) {
+        size_t vector_size = VEC_LEN * sizeof(uint32_t);
+        snrt_dma_start_1d(local_o, OUT_TEST, vector_size);
+        // Make sure the copy has completed before the compute core reads
+        // local_o after the barrier
+        snrt_dma_wait_all();
+    }
+    // Release the compute core once the result is back in TCDM
+    snrt_cluster_hw_barrier();
+
+    // Perform correctness check
+    int err = 0;
+    if (snrt_is_compute_core()) {
+        err = check_simple_mult(local_o, OUT, VEC_LEN);
+        // Compute using CPU multiplier and check
+        // NOTE(review): if VEC_LEN from data.h is not a compile-time
+        // constant, this is a variable-length array on the core's stack --
+        // confirm the stack is large enough for VEC_LEN words
+        uint32_t cpu_output[VEC_LEN];
+        cpu_simple_mult(local_a, local_b, cpu_output, VEC_LEN);
+        // Compare CPU result with golden model
+        err += check_simple_mult(cpu_output, OUT, VEC_LEN);
+    };
+    return err;
+}