Skip to content

Commit

Permalink
Add tiled simple mult example (pulp-platform#33)
Browse files Browse the repository at this point in the history
* sw: Copy simple mac example and refactor

* sw: Refactor accelerator setup

* sw: Add function for launching accelerator

* sw: Refactor correctness check

* sw: Working multiple SNAX runs

* sw: Added duplicate runs

* sw: Make duplicate runs in for loop

* sw: Fix lint for code

* sw: Move snax_mac_sw_clear to sw_barrier

* sw: Tile execution of MAC operation

* sw: Add data in to tiled implementation

* sw: Add output transfer to double buffering loop

* sw: Add license headers

* sw: Fix linting errors

* sw: Add tiled mac to makefile and runfile

* sw: Add datagen

* sw: Fix lint

* sw: Resolve minor comments

* sw: Add datagen.py to Makefile

* sw: Fix lint

* sw: Add performance counters outside tiling loop

* sw: Fix lint

---------

Co-authored-by: rgantonio <[email protected]>
  • Loading branch information
JosseVanDelm and rgantonio committed Dec 11, 2023
1 parent f4d123f commit a4ab575
Show file tree
Hide file tree
Showing 6 changed files with 255 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ jobs:
hw/snax_gemm/src/*
target/snitch_cluster/sw/apps/snax-mac/*
target/snitch_cluster/sw/apps/snax-mac-simple/*
target/snitch_cluster/sw/apps/snax-mac-tiled/*
target/snitch_cluster/sw/apps/snax-gemm-base/*
target/snitch_cluster/sw/apps/snax-gemm-engine/*
Expand Down
1 change: 1 addition & 0 deletions target/snitch_cluster/sw/apps/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ SUBDIRS += dnn/softmax
SUBDIRS += montecarlo/pi_estimation
SUBDIRS += snax-mac
SUBDIRS += snax-mac-simple
SUBDIRS += snax-mac-tiled
SUBDIRS += snax-gemm-engine
SUBDIRS += snax-gemm-base

Expand Down
1 change: 1 addition & 0 deletions target/snitch_cluster/sw/apps/snax-mac-run.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ runs:
- app: dnn/gemm
- app: snax-mac
- app: snax-mac-simple
- app: snax-mac-tiled
# dnn/gelu # seems like it stalls
# dnn/conv2d # fails with exit code 32
# dnn/fusedconv # fails newly
Expand Down
35 changes: 35 additions & 0 deletions target/snitch_cluster/sw/apps/snax-mac-tiled/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Ryan Antonio <[email protected]>
# Josse Van Delm <[email protected]>

# Interpreter used to run the data generator
PYTHON = python3
MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
DATA_DIR := $(realpath $(MK_DIR)/data)
SRC_DIR := $(realpath $(MK_DIR)/src)
DATAGEN_PY = $(DATA_DIR)/datagen.py
DATA_H = $(DATA_DIR)/data.h

# Problem size; LENGTH has to be a whole multiple of TILE_SIZE
LENGTH ?= 512
TILE_SIZE ?= 32

APP ?= snax-mac-tiled
SRCS ?= $(SRC_DIR)/snax-mac-tiled.c
INCDIRS += $(DATA_DIR) $(SRC_DIR)

# Generate the data header. The generator is run through $(PYTHON) so that it
# does not rely on the script's executable bit. If the generator fails, the
# partially written header is removed so that a later make run regenerates it
# instead of treating the truncated file as up to date.
$(DATA_H): $(DATAGEN_PY)
	$(PYTHON) $< --length=$(LENGTH) --tile_size=$(TILE_SIZE) > $@ || (rm -f $@; false)

.PHONY: clean-data clean

clean-data:
	rm -f $(DATA_H)

clean: clean-data

include ../common.mk

# The generated header must exist before dependency files are computed
$(DEP): $(DATA_H)

58 changes: 58 additions & 0 deletions target/snitch_cluster/sw/apps/snax-mac-tiled/data/datagen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
import sys
import argparse
import numpy as np
import os

# Add data utility path
sys.path.append(os.path.join(os.path.dirname(__file__),
"../../../../../../util/sim/"))
from data_utils import format_scalar_definition, \
format_vector_definition # noqa: E402

# Hard-coded bounds handed to np.random.randint when the input vectors are
# generated: values are drawn from MIN (inclusive) up to MAX (exclusive).
MIN = 0
MAX = 100


def golden_model(a, b):
    """Return the reference result: the product of a and b.

    The product is computed with the ``*`` operator, so for NumPy arrays
    of equal length it is the element-wise product.
    """
    product = a * b
    return product


def main():
    """Generate the contents of data.h and print them to stdout.

    Command line options (both mandatory):
      --length     number of elements in the input and output vectors
      --tile_size  number of elements per tile; has to divide length evenly

    Invalid or missing options are reported through argparse, which prints
    a usage message to stderr and exits with status 2.

    The emitted header defines VEC_LEN, TILE_SIZE, the random input vectors
    A and B, the golden result OUT and a zero-initialised OUT_TEST buffer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--length',
        type=int,
        required=True,
        help='Vector length of the input and output vectors')
    parser.add_argument(
        '--tile_size',
        type=int,
        required=True,
        help='Tile size length. vector_length/tile_size has to be an integer number!')
    args = parser.parse_args()
    tile_size = args.tile_size
    length = args.length

    # Validate explicitly instead of with assert: asserts are stripped under
    # `python -O`, and a zero tile size would otherwise raise
    # ZeroDivisionError in the modulo below.
    if length <= 0 or tile_size <= 0:
        parser.error("length and tile_size have to be positive integers!")
    if length % tile_size != 0:
        parser.error("vector_length/tile_size has to be an integer number!")

    # Randomly generate inputs
    a = np.random.randint(MIN, MAX, length)
    b = np.random.randint(MIN, MAX, length)
    out = golden_model(a, b)
    # Zero-filled buffer with the same length as the golden output
    out_test = np.zeros(length, dtype=int)

    # Format header file
    l_str = format_scalar_definition('uint32_t', 'VEC_LEN', length)
    t_str = format_scalar_definition('uint32_t', 'TILE_SIZE', tile_size)
    a_str = format_vector_definition('uint32_t', 'A', a)
    b_str = format_vector_definition('uint32_t', 'B', b)
    out_str = format_vector_definition('uint32_t', 'OUT', out)
    out_test_str = format_vector_definition('uint32_t', 'OUT_TEST', out_test)
    f_str = '\n\n'.join([l_str, t_str, a_str, b_str, out_str, out_test_str])
    f_str += '\n'

    # Write to stdout
    print(f_str)


# Script entry point: run main() and hand its return value to sys.exit as the
# process exit status.
if __name__ == '__main__':
    sys.exit(main())
159 changes: 159 additions & 0 deletions target/snitch_cluster/sw/apps/snax-mac-tiled/src/snax-mac-tiled.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
// Copyright 2023 KU Leuven
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Josse Van Delm <[email protected]>
// Ryan Antonio <[email protected]>

#include "snrt.h"

#include "data.h"

// Operating modes of the MAC accelerator. The selected value is written to
// CSR 0x3d6 when the accelerator is configured.
enum mode {
    // Multiply-accumulate over the elements, i.e. a dot product
    mac_mode = 0,
    // Simple element-wise multiplication
    simple_mult_mode = 1
};

// Launch the accelerator. The function performs a single CSR write and
// returns; it does not wait for the accelerator to finish.
void snax_mac_launch() {
    // Write 0 to the start CSR (0x3c0) to launch the accelerator
    write_csr(0x3c0, 0);
}

// Software-clear the HWPE accelerator by writing 0 to CSR 0x3c5, followed by
// three nop instructions.
void snax_mac_sw_clear() {
    // Write 0 to CSR 0x3c5 to clear the HWPE accelerator.
    // Without the clear the accelerator goes into undefined behaviour:
    // it might stall or continue indefinitely.
    write_csr(0x3c5, 0);
    // NOTE(review): presumably the nops give the clear time to take effect
    // before the next CSR access - confirm against the accelerator docs.
    asm volatile("nop\n");
    asm volatile("nop\n");
    asm volatile("nop\n");
}

// Block until the HWPE MAC accelerator has finished, then clear it.
void snax_mac_sw_barrier() {
    // Busy-wait on CSR 0x3c3: keep polling for as long as it reads non-zero
    for (;;) {
        if (!read_csr(0x3c3)) {
            break;
        }
    }
    // The clear is required for the HWPE MAC accelerator to allow another run
    snax_mac_sw_clear();
}

// Program the accelerator's configuration CSRs for one simple_mult run.
// Only configuration is written here; the run is not started.
void snax_mac_setup_simple_mult(uint32_t* a, uint32_t* b, uint32_t* o,
                                uint32_t vector_length) {
    /* Setup the hwpe_mac accelerator in simple_mult mode.
     * This computes the element-wise product A*B in 32 bits and stores it
     * starting from the pointer given by o.
     * args:
     *  a: pointer in TCDM (L1) to vector A
     *  b: pointer in TCDM (L1) to vector B
     *  o: pointer in TCDM (L1) to where output O must be stored
     *  vector_length: length of A, B and O in elements
     *
     * The pointers are passed to the CSRs as 32-bit values.
     * NOTE(review): CSR 0x3d2 is not written here; presumably it is unused
     * in simple_mult mode - confirm against the accelerator documentation.
     * */

    // Set addresses of the two inputs and of the output
    write_csr(0x3d0, (uint32_t)a);
    write_csr(0x3d1, (uint32_t)b);
    write_csr(0x3d3, (uint32_t)o);

    // Set configs
    write_csr(0x3d4, 1);                 // Number of iterations
    write_csr(0x3d5, vector_length);     // Vector length
    write_csr(0x3d6, simple_mult_mode);  // Set simple multiplication
}

// Reference implementation on the CPU: stores a[k] * b[k] into o[k] for every
// k in [0, vector_length). Products are 32-bit unsigned and wrap on overflow.
void cpu_simple_mult(uint32_t* a, uint32_t* b, uint32_t* o,
                     uint32_t vector_length) {
    uint32_t* const a_end = a + vector_length;
    // Walk the three vectors in lock-step, front to back
    while (a != a_end) {
        *o = *a * *b;
        ++a;
        ++b;
        ++o;
    }
}

// Compare the first vector_length elements of output against output_golden.
// Returns the number of positions at which the two differ (0 means the
// vectors are identical).
int check_simple_mult(uint32_t* output, uint32_t* output_golden,
                      uint32_t vector_length) {
    uint32_t mismatches = 0;
    for (uint32_t idx = 0; idx != vector_length; ++idx) {
        // A comparison yields 1 on mismatch and 0 otherwise
        mismatches += (output[idx] != output_golden[idx]);
    }
    return mismatches;
}

// Tiled element-wise multiplication of A and B on the MAC accelerator.
//
// The vectors are processed in tiles of TILE_SIZE elements through a
// three-stage software pipeline: the data mover (DM) core copies tile i into
// TCDM, the compute core runs the accelerator on tile i - 1, and the DM core
// copies the result of tile i - 2 back to OUT_TEST. Afterwards the compute
// core checks the result against the golden vector OUT and additionally
// checks a CPU-computed product against OUT.
//
// Returns the number of mismatching elements found by the calling core
// (0 on success; cores that do not run the check return 0).
int main() {
    uint32_t *local_a, *local_b;
    uint32_t* local_o;

    // Allocate space in TCDM: three buffers of VEC_LEN elements, back to back
    local_a = (uint32_t*)snrt_l1_next();
    local_b = local_a + VEC_LEN;
    local_o = local_b + VEC_LEN;

    uint32_t tile_size = TILE_SIZE;
    // Warning: Manually make sure VEC_LEN is a whole multiple of tile_size!
    uint32_t iterations = VEC_LEN / tile_size;
    size_t transfer_size = tile_size * sizeof(uint32_t);
    // Main tiling loop
    // I:
    // | (0) | (1) | (2) | (3) | (4) | (5) |
    // Phase:
    // |in---| cal |--out|
    //       |in---| cal |--out|
    //             |in---| cal |--out|
    //                   |in---| cal |--out|
    //
    // Note:"in" and "out" can technically not execute at the same time,
    // so they are "waiting" for each other to release the DMA.
    //
    // Add + 2 to iterations for end of pipeline
    // Cycle counter is sampled right before and right after the tiling loop
    uint32_t cycles_pre_loop = snrt_mcycle();
    for (uint32_t i = 0; i < iterations + 2; i++) {
        // Load in data: not in last two iterations
        if (snrt_is_dm_core() && i < iterations) {
            // Use data mover core to bring data from L3 to TCDM
            snrt_dma_start_1d(local_a + i * tile_size, A + i * tile_size,
                              transfer_size);
            snrt_dma_start_1d(local_b + i * tile_size, B + i * tile_size,
                              transfer_size);
        }
        // Calculate a tile: not in first iteration, not in last iteration
        if (snrt_is_compute_core() && i > 0 && i < iterations + 1) {
            snax_mac_setup_simple_mult(
                local_a + (i - 1) * tile_size, local_b + (i - 1) * tile_size,
                local_o + (i - 1) * tile_size, tile_size);
            snax_mac_launch();
            snax_mac_sw_barrier();
        }
        // Load out data: not in first two iterations
        if (snrt_is_dm_core() && i > 1) {
            // Use data mover core to bring data from TCDM to L3
            snrt_dma_start_1d(OUT_TEST + (i - 2) * tile_size,
                              local_o + (i - 2) * tile_size, transfer_size);
        }
        // snrt_dma_start_1d only issues a transfer and the cluster barrier
        // only synchronises the cores, so the DM core has to wait for its
        // outstanding transfers explicitly. Otherwise the next pipeline
        // stage may read a tile that has not fully arrived yet.
        if (snrt_is_dm_core()) {
            snrt_dma_wait_all();
        }
        // Synchronise all cores before moving on to the next pipeline step
        snrt_cluster_hw_barrier();
    }
    uint32_t cycles_post_loop = snrt_mcycle();
    // Move tiled output data from L3 back to TCDM to check for correctness
    if (snrt_is_dm_core()) {
        size_t vector_size = VEC_LEN * sizeof(uint32_t);
        snrt_dma_start_1d(local_o, OUT_TEST, vector_size);
        // Make sure the copy has completed before the result is checked
        snrt_dma_wait_all();
    }
    // Synchronise so the compute core only checks after the copy is done
    snrt_cluster_hw_barrier();

    // Perform correctness check
    int err = 0;
    if (snrt_is_compute_core()) {
        err = check_simple_mult(local_o, OUT, VEC_LEN);
        // Compute using CPU multiplier and check
        uint32_t cpu_output[VEC_LEN];
        cpu_simple_mult(local_a, local_b, cpu_output, VEC_LEN);
        // Compare CPU result with golden model
        err += check_simple_mult(cpu_output, OUT, VEC_LEN);
    }
    return err;
}

0 comments on commit a4ab575

Please sign in to comment.