Skip to content

Commit

Permalink
sw: Add GEMV, K-Means and update ATAX, Correlation, Covariance (#190)
Browse files Browse the repository at this point in the history
  • Loading branch information
colluca authored Aug 29, 2024
1 parent 3ba276e commit 3a2d92e
Show file tree
Hide file tree
Showing 36 changed files with 1,139 additions and 123 deletions.
1 change: 1 addition & 0 deletions python-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pyflexfloat
pytablewriter
pytest
pyyaml
scikit-learn
tabulate
termcolor
yamllint
Expand Down
22 changes: 18 additions & 4 deletions sw/apps/atax/scripts/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Author: Jose Pedro Castro Fonseca <jose.pc.fonseca@gmail, [email protected]>
# Author: Jose Pedro Castro Fonseca <[email protected]>
# Luca Colagrande <[email protected]>

import numpy as np
Expand All @@ -21,17 +21,31 @@ class AtaxDataGen(du.DataGen):
def golden_model(self, A, x):
    """Return the reference ATAX result ``A^T (A x)`` computed with NumPy.

    The product is evaluated right to left: first ``A x``, then the
    transpose of ``A`` times that intermediate result.
    """
    inner_product = np.matmul(A, x)
    a_transposed = A.transpose()
    return np.matmul(a_transposed, inner_product)

def validate(self, M, N, **kwargs):
    """Check the ATAX problem size before any data is generated.

    Asserts that ``N`` is a multiple of 8 (the number of cores, per the
    assertion message), then adds up the bytes needed for ``A`` (M x N),
    ``x`` (N), ``y`` (N) and ``tmp`` (M) at 8 bytes per element and passes
    the total to ``du.validate_tcdm_footprint``.

    Any additional keyword arguments are accepted and ignored.

    Raises:
        AssertionError: if ``N`` is not a multiple of 8.
    """
    assert N % 8 == 0, "N must be an integer multiple of the number of cores"

    # Element counts of every buffer; each element is counted as 8 bytes.
    element_counts = (
        M * N,  # A
        N,      # x
        N,      # y
        M,      # tmp
    )
    footprint = sum(count * 8 for count in element_counts)
    du.validate_tcdm_footprint(footprint)

def emit_header(self, **kwargs):
header = [super().emit_header()]

# Validate parameters
self.validate(**kwargs)

M, N = kwargs['M'], kwargs['N']
A = du.generate_random_array((M, N))
x = du.generate_random_array((N, 1))
y = self.golden_model(A, x)

assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
assert (N % 8) == 0, "N must be an integer multiple of the number of cores"

A = A.flatten()
x = x.flatten()
y = y.flatten()
Expand Down
16 changes: 16 additions & 0 deletions sw/apps/atax/src/args.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright 2024 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Author: Luca Colagrande <[email protected]>

#pragma once
#include <stdint.h>

// Arguments of an ATAX job, as read by atax_job().
// The field order matters: the struct is filled with a positional
// initializer of the form {M, N, (uint64_t)A, (uint64_t)x, (uint64_t)y}.
typedef struct {
    // Number of rows of A; also the length of the intermediate vector tmp
    uint32_t M;
    // Number of columns of A; also the length of x and y
    uint32_t N;
    // Address of A (M * N doubles, indexed as A[i * N + j]), cast to a
    // double pointer by atax_job()
    uint64_t A_addr;
    // Address of the input vector x (N doubles)
    uint64_t x_addr;
    // Address of the output vector y (N doubles)
    uint64_t y_addr;
} atax_args_t;
103 changes: 87 additions & 16 deletions sw/apps/atax/src/atax.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,43 +6,114 @@
// Luca Colagrande <[email protected]>

#include <stdint.h>
#include "args.h"
#include "blas.h"
#include "snrt.h"

void kernel_atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
double *tmp) {
static inline void atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
double *tmp) {
double tmp_fs;
int core_range, core_offset;
int core_range, core_offset, cluster_core_offset;

// tmp = A * x
if (snrt_is_compute_core()) {
core_range = M / snrt_cluster_compute_core_num();
core_offset = snrt_cluster_core_idx() * core_range;
for (int i1 = 0; i1 < core_range; i1++) {
int i = core_offset + i1;
tmp_fs = 0.0;
for (int j = 0; j < N; j++) {
tmp_fs += A[i * N + j] * x[j];
}
tmp[i] = tmp_fs;
}
snrt_mcycle();
gemv(0, M, N, 1, A, x, 1, tmp);
snrt_mcycle();
}

snrt_cluster_hw_barrier();

// y = At * tmp
if (snrt_is_compute_core()) {
core_range = N / snrt_cluster_compute_core_num();
core_offset = snrt_cluster_core_idx() * core_range;
snrt_mcycle();
core_range = N / snrt_global_compute_core_num();
core_offset = snrt_global_compute_core_idx() * core_range;
cluster_core_offset = snrt_cluster_core_idx() * core_range;
for (int j1 = 0; j1 < core_range; j1++) {
int j = core_offset + j1;
int cluster_j = cluster_core_offset + j1;
tmp_fs = 0.0;
for (int i = 0; i < M; i++) {
// The order of the for loops was exchanged, so that each loop
// reduces in y at position j, iterating through the i
// positions.
tmp_fs += A[i * N + j] * tmp[i];
}
y[j] = tmp_fs;
y[cluster_j] = tmp_fs;
}
snrt_fpu_fence();
snrt_mcycle();
}
}

// Run one ATAX job: copy the operands into buffers obtained from
// snrt_l1_alloc_cluster_local(), invoke the atax() kernel on them and write
// this cluster's tile of y back to the address given in the job arguments.
//
// args: pointer to an atax_args_t. Unless JOB_ARGS_PRELOADED is defined, the
//       struct is first copied into a locally allocated buffer by the DM
//       core; otherwise it is used in place.
//
// NOTE(review): the snrt_mcycle() calls appear to delimit regions for cycle
// measurement; their exact effect is not visible here, so their position
// relative to the barriers and DMA calls should be left untouched.
void atax_job(void *args) {
    double *local_A;
    double *local_x;
    double *local_y;
    double *local_tmp;
    atax_args_t *local_args;

#ifndef JOB_ARGS_PRELOADED
    // Allocate space for job arguments in TCDM
    local_args = (atax_args_t *)snrt_l1_alloc_cluster_local(sizeof(atax_args_t),
                                                            sizeof(double));

    // Copy job arguments to TCDM (issued by the DM core only)
    if (snrt_is_dm_core()) {
        snrt_dma_start_1d(local_args, args, sizeof(atax_args_t));
        snrt_dma_wait_all();
    }
    // Barrier between the copy above and the reads of local_args below, so
    // that no core dereferences local_args before the copy has completed
    snrt_cluster_hw_barrier();
#else
    // JOB_ARGS_PRELOADED is defined: use the caller's struct directly
    local_args = (atax_args_t *)args;
#endif

    // Aliases
    uint32_t M = local_args->M;
    uint32_t N = local_args->N;
    double *A = (double *)(local_args->A_addr);
    double *x = (double *)(local_args->x_addr);
    double *y = (double *)(local_args->y_addr);

    // Allocate local variables
    size_t size_A = M * N * sizeof(double);
    size_t size_x = N * sizeof(double);
    size_t size_y = N * sizeof(double);
    size_t size_tmp = M * sizeof(double);
    // Each cluster only holds its own share of y: the N outputs are split
    // evenly across snrt_cluster_num() clusters
    size_t size_y_tile = size_y / snrt_cluster_num();
    local_A = snrt_l1_alloc_cluster_local(size_A, sizeof(double));
    local_x = snrt_l1_alloc_cluster_local(size_x, sizeof(double));
    local_y = snrt_l1_alloc_cluster_local(size_y_tile, sizeof(double));
    local_tmp = snrt_l1_alloc_cluster_local(size_tmp, sizeof(double));

    // Initialize input matrices: only A and x are copied in; local_y and
    // local_tmp are not initialized here and are written by atax()
    if (snrt_is_dm_core()) {
        snrt_dma_start_1d(local_A, A, size_A);
        snrt_dma_start_1d(local_x, x, size_x);
        snrt_dma_wait_all();
    }
    snrt_mcycle();
    // Barrier between the input copies and the kernel that reads them
    snrt_cluster_hw_barrier();

    // Compute
    atax(M, N, local_A, local_x, local_y, local_tmp);
    // Barrier between the kernel and the writeback of local_y
    snrt_cluster_hw_barrier();
    snrt_mcycle();

    // Writeback results: the DM core stores this cluster's tile of
    // N / snrt_cluster_num() doubles, selected by snrt_cluster_idx(), to y
    if (snrt_is_dm_core()) {
        snrt_dma_store_1d_tile(y, local_y, snrt_cluster_idx(),
                               N / snrt_cluster_num(), sizeof(double));
        snrt_dma_wait_all();
        snrt_mcycle();
    }
    snrt_cluster_hw_barrier();

    // Free memory: snrt_l1_update_next_v2() is passed the first buffer this
    // function allocated (local_args, or local_A when the arguments were
    // preloaded). Presumably this rewinds the L1 allocator so everything
    // allocated above is released; confirm against the allocator API.
#ifndef JOB_ARGS_PRELOADED
    snrt_l1_update_next_v2(local_args);
#else
    snrt_l1_update_next_v2(local_A);
#endif
}
36 changes: 3 additions & 33 deletions sw/apps/atax/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,46 +12,16 @@

int main() {
uint32_t nerr = 0;
double *local_A;
double *local_x;
double *local_y;
double *local_tmp;

// Allocate local variables
local_A = snrt_l1_next();
local_x = local_A + M * N;
local_y = local_x + N;
local_tmp = local_y + N;

// Initialize input matrices
if (snrt_is_dm_core()) {
snrt_dma_start_1d(local_A, A, sizeof(double) * M * N);
snrt_dma_start_1d(local_x, x, sizeof(double) * N);
snrt_dma_start_1d(local_y, (void *)snrt_zero_memory_ptr(),
sizeof(double) * N);
snrt_dma_start_1d(local_tmp, (void *)snrt_zero_memory_ptr(),
sizeof(double) * M);
snrt_dma_wait_all();
}
snrt_cluster_hw_barrier();

// Compute
kernel_atax(M, N, local_A, local_x, local_y, local_tmp);
snrt_cluster_hw_barrier();

// Writeback results
if (snrt_is_dm_core()) {
snrt_dma_start_1d(y, local_y, sizeof(double) * N);
snrt_dma_wait_all();
}
snrt_cluster_hw_barrier();
atax_args_t args = {M, N, (uint64_t)A, (uint64_t)x, (uint64_t)y};
atax_job(&args);

// Check computation is correct
#ifdef BIST
if (snrt_cluster_core_idx() == 0) {
// Check y
for (int i = 0; i < N; i++) {
double diff = fabs(golden[i] - local_y[i]);
double diff = fabs(golden[i] - y[i]);
if (diff > MAX_ERROR) {
nerr++;
}
Expand Down
10 changes: 6 additions & 4 deletions sw/apps/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@ SECTION ?=
DATA_H := $($(APP)_BUILD_DIR)/data.h
DATAGEN_PY = $(SCRIPTS_DIR)/datagen.py

$(APP)_HEADERS := $(DATA_H)
$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR)
$(APP)_HEADERS := $(DATA_H)
$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR)
$(APP)_DATAGEN_ARGS += -c $($(APP)_DATA_CFG)
$(APP)_DATAGEN_ARGS += --section="$(SECTION)"

$(dir $(DATA_H)):
mkdir -p $@

$(DATA_H): DATA_CFG := $($(APP)_DATA_CFG)
$(DATA_H): DATAGEN_ARGS := $($(APP)_DATAGEN_ARGS)
$(DATA_H): $(DATAGEN_PY) $($(APP)_DATA_CFG) | $(dir $(DATA_H))
$< -c $(DATA_CFG) --section="$(SECTION)" $@
$< $(DATAGEN_ARGS) $@
17 changes: 16 additions & 1 deletion sw/apps/correlation/scripts/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Author: Jose Pedro Castro Fonseca <jose.pc.fonseca@gmail, [email protected]>
# Author: Jose Pedro Castro Fonseca <[email protected]>
# Luca Colagrande <[email protected]>

import numpy as np
Expand All @@ -21,9 +21,24 @@ class CorrelationDataGen(du.DataGen):
def golden_model(self, data):
    """Return the reference correlation matrix of ``data`` computed by NumPy.

    ``rowvar=False`` is passed to ``np.corrcoef``, so each column of
    ``data`` is treated as a variable and each row as an observation.
    """
    correlation = np.corrcoef(data, rowvar=False)
    return correlation

def validate(self, M, N, **kwargs):
    """Check the correlation problem size before any data is generated.

    Asserts that ``M`` is a multiple of 8 (the number of cores, per the
    assertion message), then adds up the bytes needed for the data
    (N x M), the correlation matrix (M x M) and the standard deviations
    (M) at 8 bytes per element and passes the total to
    ``du.validate_tcdm_footprint``.

    Any additional keyword arguments are accepted and ignored.

    Raises:
        AssertionError: if ``M`` is not a multiple of 8.
    """
    assert M % 8 == 0, "M must be an integer multiple of the number of cores"

    # Per-buffer TCDM occupation, 8 bytes per element.
    data_bytes = N * M * 8
    corr_bytes = M * M * 8
    stddev_bytes = M * 8
    du.validate_tcdm_footprint(data_bytes + corr_bytes + stddev_bytes)

def emit_header(self, **kwargs):
header = [super().emit_header()]

# Validate parameters
self.validate(**kwargs)

M, N = kwargs['M'], kwargs['N']
data = du.generate_random_array((N, M))
corr = self.golden_model(data)
Expand Down
15 changes: 15 additions & 0 deletions sw/apps/correlation/src/args.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright 2024 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Author: Luca Colagrande <[email protected]>

#pragma once
#include <stdint.h>

// Arguments of a correlation job.
// NOTE(review): the consumer of this struct is not visible here. The field
// descriptions below are inferred from the data generator, which creates
// data of shape (N, M) and an M x M correlation matrix; confirm against the
// kernel. Do not reorder fields without checking for positional
// initializers.
typedef struct {
    // Presumably the number of rows (observations) of the data matrix
    uint32_t N;
    // Presumably the number of columns (variables) of the data matrix
    uint32_t M;
    // Presumably the address of the N x M input data, as a 64-bit integer
    uint64_t data_addr;
    // Presumably the address of the M x M output correlation matrix
    uint64_t corr_addr;
} correlation_args_t;
Loading

0 comments on commit 3a2d92e

Please sign in to comment.