Skip to content

Commit

Permalink
sw: Add DOTP benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
mp-17 committed Jun 11, 2024
1 parent e90dceb commit 22f5c13
Show file tree
Hide file tree
Showing 9 changed files with 329 additions and 1 deletion.
3 changes: 2 additions & 1 deletion sw/blas/blas.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
#pragma once

#include "axpy/src/axpy.h"
#include "gemm/src/gemm.h"
#include "gemm/src/gemm.h"
#include "dotp/src/dotp.h"
31 changes: 31 additions & 0 deletions sw/blas/dotp/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Usage of absolute paths is required to externally include this Makefile
MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
DATA_DIR := $(realpath $(MK_DIR)/data)
SRC_DIR  := $(realpath $(MK_DIR)/src)

# Data-generation inputs; both may be overridden by the including Makefile
# or on the command line.
DATA_CFG ?= $(DATA_DIR)/params.json
SECTION  ?=

APP     ?= dotp
SRCS    ?= $(realpath $(SRC_DIR)/main.c)
# ?= creates a recursively expanded variable, so DATA_H (assigned further
# down) is resolved when INCDIRS is used, not here.
INCDIRS ?= $(dir $(DATA_H)) $(SRC_DIR)

DATAGEN_PY = $(MK_DIR)/scripts/datagen.py
DATA_H    ?= $(DATA_DIR)/data.h

# Output directory of the generated header (order-only prerequisite below).
$(dir $(DATA_H)):
	mkdir -p $@

# Generate the data header. The generator writes to stdout; its output goes to
# a temporary file that is renamed only on success, so a failing datagen.py
# can never leave behind a truncated header that looks up to date.
$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) | $(dir $(DATA_H))
	$< -c $(DATA_CFG) --section="$(SECTION)" > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@

.PHONY: clean-data clean

# Remove only the generated header.
clean-data:
	rm -f $(DATA_H)

# Hook data cleanup into the generic clean target.
clean: clean-data
7 changes: 7 additions & 0 deletions sw/blas/dotp/data/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

{
n: 4096
}
51 changes: 51 additions & 0 deletions sw/blas/dotp/scripts/datagen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

import numpy as np
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import format_scalar_definition, format_array_definition, \
format_array_declaration, format_ifdef_wrapper, DataGen # noqa: E402


class AxpyDataGen(DataGen):
    """Input-data and golden-result generator for the DOTP benchmark.

    NOTE(review): the class name still says "Axpy" although the golden model
    is a dot product; it is kept unchanged because verify.py imports the
    class under this name.
    """

    # Bounds of the uniform distribution the input vectors are drawn from
    MIN = -1000
    MAX = +1000
    # AXI splits bursts crossing 4KB address boundaries. To minimize
    # the occurrence of these splits the data should be aligned to 4KB
    BURST_ALIGNMENT = 4096

    def golden_model(self, x, y):
        """Return the reference dot product of `x` and `y` (via `np.dot`)."""
        return np.dot(x, y)

    def emit_header(self, **kwargs):
        """Build the contents of the generated C data header.

        Reads `n` (vector length) and `section` (forwarded to the array
        formatters) from `kwargs`.

        Returns:
            A string holding the parent header followed by the definitions of
            `n`, `x`, `y`, the declaration of `z`, and the golden result `g`
            wrapped in a `BIST` ifdef, separated by blank lines.

        Raises:
            AssertionError: if `n` is not a multiple of 8.
        """
        header = [super().emit_header()]

        n = kwargs['n']
        # Validate the size before spending time on data generation.
        # NOTE(review): the hard-coded 8 presumably matches the number of
        # compute cores in the cluster -- confirm.
        assert (n % 8) == 0, "n must be an integer multiple of the number of cores"

        x = np.random.uniform(self.MIN, self.MAX, n)
        y = np.random.uniform(self.MIN, self.MAX, n)
        g = self.golden_model(x, y)

        header += [format_scalar_definition('const uint32_t', 'n', n)]
        header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT,
                                           section=kwargs['section'])]
        header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT,
                                           section=kwargs['section'])]
        header += [format_array_declaration('double', 'z', [n], alignment=self.BURST_ALIGNMENT,
                                            section=kwargs['section'])]
        # The golden result is only emitted when the program is built with BIST
        result_def = format_scalar_definition('double', 'g', g)
        header += [format_ifdef_wrapper('BIST', result_def)]
        header = '\n\n'.join(header)

        return header


# Script entry point: run the generator's main() and use its return value as
# the process exit status.
if __name__ == '__main__':
    sys.exit(AxpyDataGen().main())
32 changes: 32 additions & 0 deletions sw/blas/dotp/scripts/verify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

import sys
from pathlib import Path
from datagen import AxpyDataGen

sys.path.append(str(Path(__file__).parent / '../../../../util/sim/'))
from verif_utils import Verifier # noqa: E402


class AxpyVerifier(Verifier):
    """Checks the DOTP benchmark output against the data generator's golden model.

    NOTE(review): the class name still says "Axpy"; it is kept so the script's
    entry point keeps working unchanged.
    """

    OUTPUT_UIDS = ['z']

    def get_actual_results(self):
        """Return the contents of output symbol `z`, read as doubles."""
        return self.get_output_from_symbol('z', 'double')

    def get_expected_results(self):
        """Return the golden dot product of input symbols `x` and `y`.

        The generated data header defines no scalar `a`, and the golden model
        takes exactly two vectors, so only `x` and `y` are read here.

        NOTE(review): `z` is declared as an n-element array while the golden
        model yields a single scalar -- confirm that check_results compares
        what is intended.
        """
        x = self.get_input_from_symbol('x', 'double')
        y = self.get_input_from_symbol('y', 'double')
        return AxpyDataGen().golden_model(x, y)

    def check_results(self, *args):
        """Delegate to the base-class check with a relative tolerance of 1e-10."""
        return super().check_results(*args, rtol=1e-10)


# Script entry point: run the verifier's main() and use its return value as
# the process exit status.
if __name__ == "__main__":
    sys.exit(AxpyVerifier().main())
88 changes: 88 additions & 0 deletions sw/blas/dotp/src/dotp.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright 2024 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#include "snrt.h"

// Dot product of input_A and input_B using a single accumulator.
//
// N        number of elements in each input vector. N - 1 is passed to frep,
//          so N == 0 wraps around to 0xFFFFFFFF and must not be used.
// input_A  first input vector, configured as the read stream of SNRT_SSR_DM0
// input_B  second input vector, configured as the read stream of SNRT_SSR_DM1
// output   the accumulated result is stored to output[0]
inline void dotp_seq (uint32_t N, double *input_A, double *input_B, double *output) {
    // Start of SSR region.
    // Bind ft0/ft1 to C variables and mark them as written by an empty asm
    // statement (presumably so the compiler does not allocate these registers
    // for other values while the SSRs are active -- confirm).
    register volatile double ft0 asm("ft0");
    register volatile double ft1 asm("ft1");
    asm volatile(""
                 : "=f"(ft0), "=f"(ft1));

    // Both streams: 1D loop over N elements with a stride of one double
    snrt_ssr_loop_1d(SNRT_SSR_DM0, N, sizeof(double));
    snrt_ssr_loop_1d(SNRT_SSR_DM1, N, sizeof(double));

    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, input_A);
    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, input_B);

    // Accumulator, pinned to fs0 and cleared before the loop
    register volatile double res_ssr asm("fs0") = 0;

    snrt_ssr_enable();

    // frep receives N - 1 as its repetition operand (presumably it encodes
    // "iterations minus one" -- confirm against the FREP specification)
    const register uint32_t Nm1 asm("t0") = N - 1;
    asm volatile(
        "frep.o %[n_frep], 1, 0, 0 \n"
        "fmadd.d %0, ft0, ft1, %0"
        : "=f"(res_ssr) /* output operands */
        : "f"(ft0), "f"(ft1), "0"(res_ssr), [n_frep]"r"(Nm1) /* input operands */
        :);

    // End of SSR region.
    snrt_fpu_fence();
    snrt_ssr_disable();
    // Empty asm that names ft0/ft1 as inputs, closing the region opened above
    asm volatile(""
                 :
                 : "f"(ft0), "f"(ft1));
    output[0] = res_ssr;
}

// Dot product of input_A and input_B using four independent accumulators
// that are reduced to a single value after the loop.
//
// N        number of elements in each input vector. The loop body holds four
//          fmadd.d instructions and frep receives (N >> 2) - 1, so N is
//          expected to be a non-zero multiple of 4: a remainder is not
//          processed and N < 4 makes the repetition count wrap around.
// input_A  first input vector, configured as the read stream of SNRT_SSR_DM0
// input_B  second input vector, configured as the read stream of SNRT_SSR_DM1
// output   the reduced result is stored to output[0]
inline void dotp_seq_4_acc (uint32_t N, double *input_A, double *input_B, double *output) {
    // Start of SSR region.
    // Bind ft0/ft1 to C variables and mark them as written by an empty asm
    // statement (presumably so the compiler does not allocate these registers
    // for other values while the SSRs are active -- confirm).
    register volatile double ft0 asm("ft0");
    register volatile double ft1 asm("ft1");
    asm volatile(""
                 : "=f"(ft0), "=f"(ft1));

    // Both streams: 1D loop over N elements with a stride of one double
    snrt_ssr_loop_1d(SNRT_SSR_DM0, N, sizeof(double));
    snrt_ssr_loop_1d(SNRT_SSR_DM1, N, sizeof(double));

    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, input_A);
    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, input_B);

    // Four accumulators, pinned to fs0..fs3 and cleared before the loop
    register volatile double res_ssr_0 asm("fs0") = 0;
    register volatile double res_ssr_1 asm("fs1") = 0;
    register volatile double res_ssr_2 asm("fs2") = 0;
    register volatile double res_ssr_3 asm("fs3") = 0;

    snrt_ssr_enable();

    // frep receives (N / 4) - 1 as its repetition operand (presumably it
    // encodes "iterations minus one" -- confirm against the FREP specification)
    const register uint32_t Nm1 asm("t0") = (N >> 2) - 1;
    asm volatile(
        "frep.o %[n_frep], 4, 0, 0 \n"
        "fmadd.d %0, ft0, ft1, %0 \n"
        "fmadd.d %1, ft0, ft1, %1 \n"
        "fmadd.d %2, ft0, ft1, %2 \n"
        "fmadd.d %3, ft0, ft1, %3"
        : "=f"(res_ssr_0), "=f"(res_ssr_1), "=f"(res_ssr_2), "=f"(res_ssr_3) /* output operands */
        : "f"(ft0), "f"(ft1), "0"(res_ssr_0), "1"(res_ssr_1), "2"(res_ssr_2), "3"(res_ssr_3), [n_frep]"r"(Nm1) /* input operands */
        :);

    // End of SSR region.
    snrt_fpu_fence();
    snrt_ssr_disable();

    // Reduce the four partial sums into res_ssr_0.
    // res_ssr_0 and res_ssr_2 are both read and written by this statement, so
    // they are declared as read-write ("+f") operands; a write-only ("=f")
    // constraint would tell the compiler their previous values are dead.
    asm volatile(
        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_1] \n"
        "fadd.d %[res_ssr_2], %[res_ssr_2], %[res_ssr_3] \n"
        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_2]"
        : [res_ssr_0]"+f"(res_ssr_0), [res_ssr_2]"+f"(res_ssr_2) /* read-write operands */
        : [res_ssr_1]"f"(res_ssr_1), [res_ssr_3]"f"(res_ssr_3) /* input operands */
        :);

    // Empty asm that names ft0/ft1 as inputs, closing the region opened above
    asm volatile(""
                 :
                 : "f"(ft0), "f"(ft1));
    output[0] = res_ssr_0;
}
107 changes: 107 additions & 0 deletions sw/blas/dotp/src/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#include "snrt.h"

#include "printf.h"

#define XSSR
#include "dotp.h"
#include "data.h"

// DOTP benchmark entry point.
//
// Copies this cluster's slice of x and y into TCDM, lets every compute core
// run dotp_seq_4_acc on its share, has cluster core 0 add up the per-core
// partial results, and prints cycle count, performance and utilization.
// When built with BIST the result is compared against the golden value g and
// the number of errors is returned; otherwise 0 is returned.
int main() {
    double *local_x, *local_y, *local_z;
    double *remote_x, *remote_y, *remote_z;

    volatile double sum;

    uint32_t start_cycle, end_cycle;

    // Calculate size and pointers for each cluster
    uint32_t frac = n / snrt_cluster_num();
    uint32_t offset = frac * snrt_cluster_idx();
    remote_x = x + offset;
    remote_y = y + offset;
    remote_z = z + snrt_cluster_idx();

    // Allocate space in TCDM
    local_x = (double *)snrt_l1_next();
    local_y = local_x + frac;
    local_z = local_y + frac;

    // Copy data in TCDM
    if (snrt_is_dm_core()) {
        size_t size = frac * sizeof(double);
        snrt_dma_start_1d(local_x, remote_x, size);
        snrt_dma_start_1d(local_y, remote_y, size);
        snrt_dma_wait_all();
    }

    // Calculate TCDM size and pointers for each core.
    // Only `frac` elements per vector were copied into this cluster's TCDM,
    // so the per-core share is derived from `frac` rather than from the
    // global `n` (identical for a single cluster, in bounds for several).
    int core_idx = snrt_cluster_core_idx();
    int frac_core = frac / snrt_cluster_compute_core_num();
    int offset_core = core_idx * frac_core;
    local_x += offset_core;
    local_y += offset_core;
    local_z += core_idx;

    // Wait until the input data has been copied
    snrt_cluster_hw_barrier();

    // Compute
    if (!snrt_is_dm_core()) {
        start_cycle = snrt_mcycle();
        dotp_seq_4_acc(frac_core, local_x, local_y, local_z);
        // NOTE(review): this barrier is only reached by the compute cores;
        // confirm how it pairs with the barriers executed by the DM core.
        snrt_cluster_hw_barrier();

#ifndef _DOTP_EXCLUDE_FINAL_SYNC_
        // Core 0 (local_z offset 0) reduces the per-core partial results
        if (!snrt_cluster_core_idx()) {
            sum = 0;
            for (uint32_t i = 0; i < snrt_cluster_compute_core_num(); ++i) {
                sum += local_z[i];
            }
        }
        snrt_fpu_fence();
#endif

        end_cycle = snrt_mcycle();
    }

    snrt_cluster_hw_barrier();

    if (!snrt_cluster_core_idx()) {
        unsigned int runtime = end_cycle - start_cycle;
        // n multiplications and n - 1 additions
        double performance = (double) (2 * n - 1) / runtime;
        double util = 100 * (performance / (2 * snrt_cluster_compute_core_num()));

        printf("Core %d execution time: %u cycles\nPerformance: %f DP-FLOP/Cycle\nUtilization: %f%%\n",
               snrt_cluster_core_idx(), runtime, performance, util);
    }

    snrt_cluster_hw_barrier();

    // Copy data out of TCDM
    // NOTE(review): on the DM core local_z has been advanced by its own
    // core_idx above, and frac_core doubles are copied although each compute
    // core produces one partial result -- confirm this is intended.
    if (snrt_is_dm_core()) {
        size_t size = frac_core * sizeof(double);
        snrt_dma_start_1d(remote_z, local_z, size);
        snrt_dma_wait_all();
    }

    snrt_cluster_hw_barrier();

    // TODO: currently only works for single cluster otherwise need to
    // synchronize all cores here
#ifdef BIST
    uint32_t nerr = 1;

    // Check computation is correct
    // NOTE(review): exact floating-point equality; `sum` is only assigned
    // when _DOTP_EXCLUDE_FINAL_SYNC_ is not defined.
    if (snrt_global_core_idx() == 0) {
        if (sum == g) nerr--;
        printf("%f %f\n", sum, g);
    }

    return nerr;
#endif

    return 0;
}
1 change: 1 addition & 0 deletions target/snitch_cluster/sw.mk
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ APPS = sw/apps/lto
APPS += sw/apps/nop
APPS += sw/apps/blas/axpy
APPS += sw/apps/blas/gemm
APPS += sw/apps/blas/dotp
APPS += sw/apps/dnn/batchnorm
APPS += sw/apps/dnn/conv2d
APPS += sw/apps/dnn/fusedconv
Expand Down
10 changes: 10 additions & 0 deletions target/snitch_cluster/sw/apps/blas/dotp/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright 2024 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Matteo Perotti <[email protected]>

# Pull in the benchmark description shared across targets: it sets APP, SRCS,
# INCDIRS and DATA_H and provides the rule that generates $(DATA_H).
include ../../../../../../sw/blas/dotp/Makefile
# Target-specific build rules.
# NOTE(review): common.mk is not visible here; presumably it defines $(DEP)
# and consumes the variables set above -- confirm.
include ../../common.mk

# The generated data header must exist before $(DEP) is built.
$(DEP): $(DATA_H)

0 comments on commit 22f5c13

Please sign in to comment.