forked from pulp-platform/snitch_cluster
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add tiled simple mult example (pulp-platform#33)
* sw: Copy simple mac example and refactor * sw: Refactor accelerator setup * sw: Add function for launching accelerator * sw: Refactor correctness check * sw: Working multiple SNAX runs * sw: Added duplicate runs * sw: Make duplicate runs in for loop * sw: Fix lint for code * sw: Move snax_mac_sw_clear to sw_barrier * sw: Tile execution of MAC operation * sw: Add data in to tiled implementation * sw: Add output transfer to double buffering loop * sw: Add license headers * sw: Fix linting errors * sw: Add tiled mac to makefile and runfile * sw: Add datagen * sw: Fix lint * sw: Resolve minor comments * sw: Add datagen.py to Makefile * sw: Fix lint * sw: Add performance counters outside tiling loop * sw: Fix lint --------- Co-authored-by: rgantonio <[email protected]>
- Loading branch information
1 parent
f4d123f
commit a4ab575
Showing
6 changed files
with
255 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Copyright 2023 ETH Zurich and University of Bologna. | ||
# Licensed under the Apache License, Version 2.0, see LICENSE for details. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# Ryan Antonio <[email protected]> | ||
# Josse Van Delm <[email protected]> | ||
|
||
# Interpreter used to run the data generator
PYTHON = python3

# Directory layout, resolved relative to this Makefile
MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
DATA_DIR := $(realpath $(MK_DIR)/data)
SRC_DIR  := $(realpath $(MK_DIR)/src)
DATAGEN_PY = $(DATA_DIR)/datagen.py
DATA_H     = $(DATA_DIR)/data.h

# Problem size; LENGTH must be an integer multiple of TILE_SIZE
LENGTH    ?= 512
TILE_SIZE ?= 32

APP     ?= snax-mac-tiled
SRCS    ?= $(SRC_DIR)/snax-mac-tiled.c
INCDIRS += $(DATA_DIR) $(SRC_DIR)

# Generate the data header.
# * Run the generator through $(PYTHON) so it works even when datagen.py
#   does not have its executable bit set.
# * Remove the target on failure: the shell redirect creates $@ before the
#   generator runs, and a truncated header would otherwise look up to date.
$(DATA_H): $(DATAGEN_PY)
	$(PYTHON) $< --length=$(LENGTH) --tile_size=$(TILE_SIZE) > $@ || { rm -f $@; exit 1; }

.PHONY: clean-data clean

clean-data:
	rm -f $(DATA_H)

# Extend the clean target so the generated header is removed as well
clean: clean-data

include ../common.mk

# The generated header must exist before dependencies are computed
$(DEP): $(DATA_H)
|
58 changes: 58 additions & 0 deletions
58
target/snitch_cluster/sw/apps/snax-mac-tiled/data/datagen.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/usr/bin/env python3 | ||
import sys | ||
import argparse | ||
import numpy as np | ||
import os | ||
|
||
# Make the shared data utilities importable: extend the module search path
# with the util/sim directory, located relative to this script
_DATA_UTILS_DIR = os.path.join(os.path.dirname(__file__),
                               "../../../../../../util/sim/")
sys.path.append(_DATA_UTILS_DIR)
from data_utils import (  # noqa: E402
    format_scalar_definition,
    format_vector_definition,
)

# Hard parameters: bounds handed to np.random.randint when the random input
# vectors are drawn
MIN = 0
MAX = 100
|
||
|
||
def golden_model(a, b):
    """Return the reference result ``a * b``.

    For the NumPy arrays passed in by ``main`` this is the element-wise
    product of the two input vectors.
    """
    product = a * b
    return product
|
||
|
||
def main():
    """Generate the C data header for the tiled SNAX MAC example.

    Parses ``--length`` and ``--tile_size``, draws two random input vectors,
    computes their golden product and prints C definitions for ``VEC_LEN``,
    ``TILE_SIZE``, ``A``, ``B``, ``OUT`` and ``OUT_TEST`` to stdout.

    Invalid or missing arguments terminate the script through
    ``argparse`` (usage message on stderr, non-zero exit status).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--length',
        type=int,
        required=True,
        help='Vector length of the input and output vectors')
    parser.add_argument(
        '--tile_size',
        type=int,
        required=True,
        help='Tile size length. '
             'vector_length/tile_size has to be an integer number!')
    args = parser.parse_args()
    tile_size = args.tile_size
    length = args.length

    # Validate explicitly instead of with ``assert``: asserts are stripped
    # under ``python -O``, and a zero tile size would otherwise surface as a
    # bare ZeroDivisionError in the modulo below.
    if length <= 0 or tile_size <= 0:
        parser.error("length and tile_size have to be positive integers!")
    if length % tile_size != 0:
        parser.error("vector_length/tile_size has to be an integer number!")

    # Randomly generate inputs
    a = np.random.randint(MIN, MAX, length)
    b = np.random.randint(MIN, MAX, length)
    out = golden_model(a, b)
    # Zero-initialised buffer emitted as OUT_TEST
    out_test = np.zeros(length, dtype=int)

    # Format header file
    l_str = format_scalar_definition('uint32_t', 'VEC_LEN', length)
    t_str = format_scalar_definition('uint32_t', 'TILE_SIZE', tile_size)
    a_str = format_vector_definition('uint32_t', 'A', a)
    b_str = format_vector_definition('uint32_t', 'B', b)
    out_str = format_vector_definition('uint32_t', 'OUT', out)
    out_test_str = format_vector_definition('uint32_t', 'OUT_TEST', out_test)
    f_str = '\n\n'.join([l_str, t_str, a_str, b_str, out_str, out_test_str])
    f_str += '\n'

    # Write to stdout
    print(f_str)
|
||
|
||
if __name__ == '__main__':
    # Hand main()'s return value to the interpreter as the exit status
    exit_status = main()
    sys.exit(exit_status)
159 changes: 159 additions & 0 deletions
159
target/snitch_cluster/sw/apps/snax-mac-tiled/src/snax-mac-tiled.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
// Copyright 2023 KU Leuven | ||
// Licensed under the Apache License, Version 2.0, see LICENSE for details. | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
// Josse Van Delm <[email protected]> | ||
// Ryan Antonio <[email protected]> | ||
|
||
#include "snrt.h" | ||
|
||
#include "data.h" | ||
|
||
// Operating modes of the accelerator:
//   mac_mode (0)         multiply-accumulate over elements, i.e. a dot
//                        product
//   simple_mult_mode (1) simple elementwise multiplication
enum mode {
    mac_mode = 0,
    simple_mult_mode = 1
};
|
||
// Launch the accelerator by writing 0 to its start CSR (0x3c0).
void snax_mac_launch() {
    write_csr(0x3c0, 0);
}
|
||
void snax_mac_sw_clear() { | ||
// write 0x3c5 to clear HWPE accelerator | ||
// Otherwise the accelerator goes into undefined behaviour: | ||
// It might stall/continue indefinitely | ||
write_csr(0x3c5, 0); | ||
asm volatile("nop\n"); | ||
asm volatile("nop\n"); | ||
asm volatile("nop\n"); | ||
} | ||
|
||
// Block until the HWPE MAC accelerator is finished, then clear it.
void snax_mac_sw_barrier() {
    // Busy-wait: CSR 0x3c3 reads non-zero until the accelerator is done
    while (read_csr(0x3c3) != 0) {
        /* spin */
    }
    // The clear is necessary for the accelerator to allow multiple runs
    snax_mac_sw_clear();
}
|
||
void snax_mac_setup_simple_mult(uint32_t* a, uint32_t* b, uint32_t* o, | ||
uint32_t vector_length) { | ||
/* Setup the hwpe_mac accelerator in simple_mult mode. | ||
* This computes the product A*B in 32 bits and stores it starting | ||
* from the pointer given by o | ||
* args: | ||
* a: pointer in TCDM (L1) to vector A | ||
* b: pointer in TCDM (L1) to vector B | ||
* o: pointer in TCDM (L1) to where output O must be stored | ||
* vector_length: length of A,B and O | ||
* */ | ||
|
||
// Set addresses | ||
write_csr(0x3d0, (uint32_t)a); | ||
write_csr(0x3d1, (uint32_t)b); | ||
write_csr(0x3d3, (uint32_t)o); | ||
|
||
// Set configs | ||
write_csr(0x3d4, 1); // Number of iterations | ||
write_csr(0x3d5, vector_length); // Vector length | ||
write_csr(0x3d6, simple_mult_mode); // Set simple multiplication | ||
} | ||
|
||
/* Elementwise multiplication on the CPU: o[k] = a[k] * b[k] for every k in
 * [0, vector_length), processed in ascending order.
 */
void cpu_simple_mult(uint32_t* a, uint32_t* b, uint32_t* o,
                     uint32_t vector_length) {
    const uint32_t* const a_end = a + vector_length;
    while (a != a_end) {
        *o = (*a) * (*b);
        a++;
        b++;
        o++;
    }
}
|
||
/* Count the positions at which output differs from output_golden.
 * Both arrays are compared over vector_length elements; the return value
 * is the number of mismatching elements (0 when they are identical).
 */
int check_simple_mult(uint32_t* output, uint32_t* output_golden,
                      uint32_t vector_length) {
    uint32_t mismatches = 0;
    uint32_t idx = 0;
    while (idx < vector_length) {
        // Every element that deviates from the golden output counts once
        mismatches += (output[idx] != output_golden[idx]) ? 1u : 0u;
        idx++;
    }
    return mismatches;
}
|
||
int main() { | ||
uint32_t *local_a, *local_b; | ||
uint32_t* local_o; | ||
|
||
// Allocate space in TCDM | ||
local_a = (uint32_t*)snrt_l1_next(); | ||
local_b = local_a + VEC_LEN; | ||
local_o = local_b + VEC_LEN; | ||
|
||
uint32_t tile_size = TILE_SIZE; | ||
// Warning: Manually make sure this is an integer number! | ||
uint32_t iterations = VEC_LEN / tile_size; | ||
size_t transfer_size = tile_size * sizeof(uint32_t); | ||
// Main tiling loop | ||
// I: | ||
// | (0) | (1) | (2) | (3) | (4) | (5) | | ||
// Phase: | ||
// |in---| cal |--out| | ||
// |in---| cal | out| | ||
// |in---| cal |--out| | ||
// |in---| cal |--out| | ||
// | ||
// Note:"in" and "out" can technically not execute at the same time, | ||
// so they are "waiting" for each other to release the DMA. | ||
// | ||
// Add + 2 to iterations for end of pipeline | ||
uint32_t cycles_pre_loop = snrt_mcycle(); | ||
for (uint32_t i = 0; i < iterations + 2; i++) { | ||
// Load in data: not in last two iterations | ||
if (snrt_is_dm_core() && i < iterations) { | ||
// Use data mover core to bring data from L3 to TCDM | ||
snrt_dma_start_1d(local_a + i * tile_size, A + i * tile_size, | ||
transfer_size); | ||
snrt_dma_start_1d(local_b + i * tile_size, B + i * tile_size, | ||
transfer_size); | ||
} | ||
// Calculate a tile: not in first iteration, not in last iteration | ||
if (snrt_is_compute_core() && i > 0 && i < iterations + 1) { | ||
snax_mac_setup_simple_mult( | ||
local_a + (i - 1) * tile_size, local_b + (i - 1) * tile_size, | ||
local_o + (i - 1) * tile_size, tile_size); | ||
snax_mac_launch(); | ||
snax_mac_sw_barrier(); | ||
} | ||
// Load out data: not in first two iterations | ||
if (snrt_is_dm_core() && i > 1) { | ||
// Use data mover core to bring data from TCDM to L3 | ||
snrt_dma_start_1d(OUT_TEST + (i - 2) * tile_size, | ||
local_o + (i - 2) * tile_size, transfer_size); | ||
} | ||
// Wait until DMA transfers are done | ||
snrt_cluster_hw_barrier(); | ||
} | ||
uint32_t cycles_post_loop = snrt_mcycle(); | ||
// Move tiled output data from L3 back to TCDM to check for correctness | ||
if (snrt_is_dm_core()) { | ||
size_t vector_size = VEC_LEN * sizeof(uint32_t); | ||
snrt_dma_start_1d(local_o, OUT_TEST, vector_size); | ||
} | ||
// Wait until DMA transfer is done | ||
snrt_cluster_hw_barrier(); | ||
|
||
// Perform correctness check | ||
int err = 0; | ||
if (snrt_is_compute_core()) { | ||
err = check_simple_mult(local_o, OUT, VEC_LEN); | ||
// Compute using CPU multiplier and check | ||
uint32_t cpu_output[VEC_LEN]; | ||
cpu_simple_mult(local_a, local_b, cpu_output, VEC_LEN); | ||
// Compare CPU result with golden model | ||
err += check_simple_mult(cpu_output, OUT, VEC_LEN); | ||
}; | ||
return err; | ||
} |