Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sw: Add GEMV, K-Means and update ATAX, Correlation, Covariance #190

Merged
merged 11 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pyflexfloat
pytablewriter
pytest
pyyaml
scikit-learn
tabulate
termcolor
yamllint
Expand Down
22 changes: 18 additions & 4 deletions sw/apps/atax/scripts/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Author: Jose Pedro Castro Fonseca <jose.pc.fonseca@gmail, [email protected]>
# Author: Jose Pedro Castro Fonseca <[email protected]>
# Luca Colagrande <[email protected]>

import numpy as np
Expand All @@ -21,17 +21,31 @@ class AtaxDataGen(du.DataGen):
def golden_model(self, A, x):
    """Return the reference ATAX result ``A^T * (A * x)``.

    The product is evaluated right to left, so the intermediate ``A * x``
    is formed first and then multiplied by the transpose of ``A``.
    """
    tmp = A @ x
    return A.T @ tmp

def validate(self, M, N, **kwargs):
    """Check that the ATAX parameters are usable before generating data.

    ``N`` must be divisible by 8 (the number of cores, per the assertion
    message), and the combined size of ``A`` (M x N), ``x`` (N), ``y`` (N)
    and ``tmp`` (M), at 8 bytes per element, is handed to
    ``du.validate_tcdm_footprint``. Extra keyword arguments are ignored.
    """
    assert N % 8 == 0, "N must be an integer multiple of the number of cores"

    # Element count of every buffer included in the TCDM footprint
    element_counts = (
        M * N,  # A
        N,      # x
        N,      # y
        M,      # tmp
    )
    footprint = sum(count * 8 for count in element_counts)
    du.validate_tcdm_footprint(footprint)

def emit_header(self, **kwargs):
header = [super().emit_header()]

# Validate parameters
self.validate(**kwargs)

M, N = kwargs['M'], kwargs['N']
A = du.generate_random_array((M, N))
x = du.generate_random_array((N, 1))
y = self.golden_model(A, x)

assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
assert (N % 8) == 0, "N must be an integer multiple of the number of cores"

A = A.flatten()
x = x.flatten()
y = y.flatten()
Expand Down
16 changes: 16 additions & 0 deletions sw/apps/atax/src/args.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright 2024 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Author: Luca Colagrande <[email protected]>

#pragma once
#include <stdint.h>

// Arguments of an ATAX job. The structure is copied as a whole
// (sizeof(atax_args_t) bytes) by atax_job() unless the arguments are
// preloaded, so the field order is part of its layout.
typedef struct {
    // Number of rows of A (A is indexed as A[i * N + j] with i < M)
    uint32_t M;
    // Number of columns of A; also the number of elements of x and y
    uint32_t N;
    // Address of the input matrix A, cast to double* by atax_job()
    uint64_t A_addr;
    // Address of the input vector x, cast to double* by atax_job()
    uint64_t x_addr;
    // Address of the output vector y, cast to double* by atax_job()
    uint64_t y_addr;
} atax_args_t;
103 changes: 87 additions & 16 deletions sw/apps/atax/src/atax.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,43 +6,114 @@
// Luca Colagrande <[email protected]>

#include <stdint.h>
#include "args.h"
#include "blas.h"
#include "snrt.h"

void kernel_atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
double *tmp) {
static inline void atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
double *tmp) {
double tmp_fs;
int core_range, core_offset;
int core_range, core_offset, cluster_core_offset;

// tmp = A * x
if (snrt_is_compute_core()) {
core_range = M / snrt_cluster_compute_core_num();
core_offset = snrt_cluster_core_idx() * core_range;
for (int i1 = 0; i1 < core_range; i1++) {
int i = core_offset + i1;
tmp_fs = 0.0;
for (int j = 0; j < N; j++) {
tmp_fs += A[i * N + j] * x[j];
}
tmp[i] = tmp_fs;
}
snrt_mcycle();
gemv(0, M, N, 1, A, x, 1, tmp);
snrt_mcycle();
}

snrt_cluster_hw_barrier();

// y = At * tmp
if (snrt_is_compute_core()) {
core_range = N / snrt_cluster_compute_core_num();
core_offset = snrt_cluster_core_idx() * core_range;
snrt_mcycle();
core_range = N / snrt_global_compute_core_num();
core_offset = snrt_global_compute_core_idx() * core_range;
cluster_core_offset = snrt_cluster_core_idx() * core_range;
for (int j1 = 0; j1 < core_range; j1++) {
int j = core_offset + j1;
int cluster_j = cluster_core_offset + j1;
tmp_fs = 0.0;
for (int i = 0; i < M; i++) {
// The order of the for loops was exchanged, so that each loop
// reduces in y at position j, iterating through the i
// positions.
tmp_fs += A[i * N + j] * tmp[i];
}
y[j] = tmp_fs;
y[cluster_j] = tmp_fs;
}
snrt_fpu_fence();
snrt_mcycle();
}
}

// Runs one ATAX job on the calling cluster: stages the job arguments and the
// operands A and x in buffers obtained from snrt_l1_alloc_cluster_local(),
// invokes atax() on those buffers and writes the cluster's tile of y back to
// the destination given in the arguments.
//
// args: pointer to an atax_args_t. Unless JOB_ARGS_PRELOADED is defined, the
//       structure is first copied into a newly allocated local buffer by the
//       DM core; otherwise the pointer is used in place.
//
// NOTE(review): the function contains cluster hardware barriers, so it is
// presumably meant to be called by every core of the cluster - confirm.
void atax_job(void *args) {
    double *local_A;
    double *local_x;
    double *local_y;
    double *local_tmp;
    atax_args_t *local_args;

#ifndef JOB_ARGS_PRELOADED
    // Allocate space for job arguments in TCDM
    local_args = (atax_args_t *)snrt_l1_alloc_cluster_local(sizeof(atax_args_t),
                                                            sizeof(double));

    // Copy job arguments to TCDM
    if (snrt_is_dm_core()) {
        snrt_dma_start_1d(local_args, args, sizeof(atax_args_t));
        snrt_dma_wait_all();
    }
    // The arguments are dereferenced by all cores right after this barrier
    snrt_cluster_hw_barrier();
#else
    local_args = (atax_args_t *)args;
#endif

    // Aliases
    uint32_t M = local_args->M;
    uint32_t N = local_args->N;
    double *A = (double *)(local_args->A_addr);
    double *x = (double *)(local_args->x_addr);
    double *y = (double *)(local_args->y_addr);

    // Allocate local variables
    size_t size_A = M * N * sizeof(double);
    size_t size_x = N * sizeof(double);
    size_t size_y = N * sizeof(double);
    size_t size_tmp = M * sizeof(double);
    // Only a 1/snrt_cluster_num() share of y is buffered locally
    size_t size_y_tile = size_y / snrt_cluster_num();
    local_A = snrt_l1_alloc_cluster_local(size_A, sizeof(double));
    local_x = snrt_l1_alloc_cluster_local(size_x, sizeof(double));
    local_y = snrt_l1_alloc_cluster_local(size_y_tile, sizeof(double));
    local_tmp = snrt_l1_alloc_cluster_local(size_tmp, sizeof(double));

    // Initialize input matrices (full copies of A and x; y and tmp are not
    // initialized here)
    if (snrt_is_dm_core()) {
        snrt_dma_start_1d(local_A, A, size_A);
        snrt_dma_start_1d(local_x, x, size_x);
        snrt_dma_wait_all();
    }
    // NOTE(review): the snrt_mcycle() calls look like profiling markers
    // delimiting the phases of the job - confirm against the runtime.
    snrt_mcycle();
    snrt_cluster_hw_barrier();

    // Compute
    atax(M, N, local_A, local_x, local_y, local_tmp);
    snrt_cluster_hw_barrier();
    snrt_mcycle();

    // Writeback results
    // NOTE(review): presumably stores tile number snrt_cluster_idx(), made of
    // N / snrt_cluster_num() doubles, into y - confirm the argument order of
    // snrt_dma_store_1d_tile().
    if (snrt_is_dm_core()) {
        snrt_dma_store_1d_tile(y, local_y, snrt_cluster_idx(),
                               N / snrt_cluster_num(), sizeof(double));
        snrt_dma_wait_all();
        snrt_mcycle();
    }
    snrt_cluster_hw_barrier();

    // Free memory
    // The pointer passed is the first buffer this job allocated: local_args
    // when the arguments were copied, local_A when they were preloaded.
#ifndef JOB_ARGS_PRELOADED
    snrt_l1_update_next_v2(local_args);
#else
    snrt_l1_update_next_v2(local_A);
#endif
}
36 changes: 3 additions & 33 deletions sw/apps/atax/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,46 +12,16 @@

int main() {
uint32_t nerr = 0;
double *local_A;
double *local_x;
double *local_y;
double *local_tmp;

// Allocate local variables
local_A = snrt_l1_next();
local_x = local_A + M * N;
local_y = local_x + N;
local_tmp = local_y + N;

// Initialize input matrices
if (snrt_is_dm_core()) {
snrt_dma_start_1d(local_A, A, sizeof(double) * M * N);
snrt_dma_start_1d(local_x, x, sizeof(double) * N);
snrt_dma_start_1d(local_y, (void *)snrt_zero_memory_ptr(),
sizeof(double) * N);
snrt_dma_start_1d(local_tmp, (void *)snrt_zero_memory_ptr(),
sizeof(double) * M);
snrt_dma_wait_all();
}
snrt_cluster_hw_barrier();

// Compute
kernel_atax(M, N, local_A, local_x, local_y, local_tmp);
snrt_cluster_hw_barrier();

// Writeback results
if (snrt_is_dm_core()) {
snrt_dma_start_1d(y, local_y, sizeof(double) * N);
snrt_dma_wait_all();
}
snrt_cluster_hw_barrier();
atax_args_t args = {M, N, (uint64_t)A, (uint64_t)x, (uint64_t)y};
atax_job(&args);

// Check computation is correct
#ifdef BIST
if (snrt_cluster_core_idx() == 0) {
// Check y
for (int i = 0; i < N; i++) {
double diff = fabs(golden[i] - local_y[i]);
double diff = fabs(golden[i] - y[i]);
if (diff > MAX_ERROR) {
nerr++;
}
Expand Down
10 changes: 6 additions & 4 deletions sw/apps/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@ SECTION ?=
DATA_H := $($(APP)_BUILD_DIR)/data.h
DATAGEN_PY = $(SCRIPTS_DIR)/datagen.py

$(APP)_HEADERS := $(DATA_H)
$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR)
$(APP)_HEADERS := $(DATA_H)
$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR)
$(APP)_DATAGEN_ARGS += -c $($(APP)_DATA_CFG)
$(APP)_DATAGEN_ARGS += --section="$(SECTION)"

$(dir $(DATA_H)):
mkdir -p $@

$(DATA_H): DATA_CFG := $($(APP)_DATA_CFG)
$(DATA_H): DATAGEN_ARGS := $($(APP)_DATAGEN_ARGS)
$(DATA_H): $(DATAGEN_PY) $($(APP)_DATA_CFG) | $(dir $(DATA_H))
$< -c $(DATA_CFG) --section="$(SECTION)" $@
$< $(DATAGEN_ARGS) $@
17 changes: 16 additions & 1 deletion sw/apps/correlation/scripts/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Author: Jose Pedro Castro Fonseca <jose.pc.fonseca@gmail, [email protected]>
# Author: Jose Pedro Castro Fonseca <[email protected]>
# Luca Colagrande <[email protected]>

import numpy as np
Expand All @@ -21,9 +21,24 @@ class CorrelationDataGen(du.DataGen):
def golden_model(self, data):
    """Return the reference correlation matrix of ``data``.

    ``rowvar=False`` makes NumPy treat every column of ``data`` as one
    variable and every row as one observation.
    """
    corr = np.corrcoef(data, rowvar=False)
    return corr

def validate(self, M, N, **kwargs):
    """Check that the correlation parameters are usable before generating data.

    ``M`` must be divisible by 8 (the number of cores, per the assertion
    message), and the combined size of the data (N x M), correlation
    (M x M) and standard-deviation (M) buffers, at 8 bytes per element, is
    handed to ``du.validate_tcdm_footprint``. Extra keyword arguments are
    ignored.
    """
    assert M % 8 == 0, "M must be an integer multiple of the number of cores"

    # Element count of every buffer included in the TCDM footprint
    element_counts = (
        N * M,  # data
        M * M,  # corr
        M,      # stddev
    )
    footprint = sum(count * 8 for count in element_counts)
    du.validate_tcdm_footprint(footprint)

def emit_header(self, **kwargs):
header = [super().emit_header()]

# Validate parameters
self.validate(**kwargs)

M, N = kwargs['M'], kwargs['N']
data = du.generate_random_array((N, M))
corr = self.golden_model(data)
Expand Down
15 changes: 15 additions & 0 deletions sw/apps/correlation/src/args.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright 2024 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Author: Luca Colagrande <[email protected]>

#pragma once
#include <stdint.h>

// Arguments of a correlation job.
// NOTE(review): the field meanings below are taken from the accompanying
// datagen script, which generates an N x M data array and an M x M
// correlation matrix - confirm against the kernel that consumes this struct.
typedef struct {
    // Presumably the number of rows (observations) of the data array
    uint32_t N;
    // Presumably the number of columns (variables) of the data array
    uint32_t M;
    // Address of the input data array, stored as a 64-bit integer
    uint64_t data_addr;
    // Address of the output correlation matrix, stored as a 64-bit integer
    uint64_t corr_addr;
} correlation_args_t;
Loading
Loading