Skip to content

Commit

Permalink
ci: Correct linting
Browse files Browse the repository at this point in the history
  • Loading branch information
colluca committed Aug 20, 2024
1 parent 120fea8 commit 5401e44
Show file tree
Hide file tree
Showing 10 changed files with 114 additions and 161 deletions.
3 changes: 2 additions & 1 deletion sw/apps/covariance/src/args.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
#include <stdint.h>

// Function-pointer type shared by the covariance kernel variants
// (naive, baseline, opt). Kernels center the m rows of `data`/`datat`
// using the precomputed 1/n factor, then accumulate the covariance
// into `cov` scaled by 1/(n-1).
typedef void (*covariance_fp_t)(uint32_t m, uint32_t n, double inv_n,
                                double inv_n_m1, double *data, double *datat,
                                double *cov);

typedef struct {
uint32_t m;
Expand Down
118 changes: 49 additions & 69 deletions sw/apps/covariance/src/covariance.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,13 @@

// Enable double buffering of tile transfers (0 disables it); tested as a
// boolean in the `if (DOUBLE_BUFFER)` branches of covariance_job() below.
#define DOUBLE_BUFFER 1

void covariance_naive(uint32_t m, uint32_t n, double inv_n,
double inv_n_m1, double *data, double *datat,
double *cov) {
void covariance_naive(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
double *data, double *datat, double *cov) {
uint32_t offset = snrt_cluster_core_idx();
uint32_t stride = snrt_cluster_compute_core_num();

// Center data
for (uint32_t i = offset; i < m; i += stride) {

// Calculate row mean
double data_mean = 0.0;
double datat_mean = 0.0;
Expand All @@ -44,15 +42,13 @@ void covariance_naive(uint32_t m, uint32_t n, double inv_n,
syrk_naive(m, n, inv_n_m1, data, datat, 0, cov);
}

void covariance_baseline(uint32_t m, uint32_t n, double inv_n,
double inv_n_m1, double *data, double *datat,
double *cov) {
void covariance_baseline(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
double *data, double *datat, double *cov) {
uint32_t offset = snrt_cluster_core_idx();
uint32_t stride = snrt_cluster_compute_core_num();

// Center data
for (uint32_t i = offset; i < m; i += stride) {

// Calculate row mean
double data_mean = 0.0;
double datat_mean = 0.0;
Expand All @@ -77,9 +73,8 @@ void covariance_baseline(uint32_t m, uint32_t n, double inv_n,
syrk_baseline(m, n, inv_n_m1, data, datat, 0, cov);
}

void covariance_opt(uint32_t m, uint32_t n, double inv_n,
double inv_n_m1, double *data, double *datat,
double *cov) {
void covariance_opt(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
double *data, double *datat, double *cov) {
uint32_t offset = snrt_cluster_core_idx();
uint32_t stride = snrt_cluster_compute_core_num();

Expand All @@ -97,14 +92,14 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
// ft0.push(data[i * n + j])
// ft1.push(datat[i * n + j])
const uint32_t ssr01_b[4] = {unroll0, n, 2, m / (stride * unroll0)};
const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double),
0, sizeof(double) * n * stride * unroll0};
snrt_ssr_loop_4d(SNRT_SSR_DM0,
ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3],
ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]);
snrt_ssr_loop_4d(SNRT_SSR_DM1,
ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3],
ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]);
const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double), 0,
sizeof(double) * n * stride * unroll0};
snrt_ssr_loop_4d(SNRT_SSR_DM0, ssr01_b[0], ssr01_b[1], ssr01_b[2],
ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2],
ssr01_i[3]);
snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr01_b[0], ssr01_b[1], ssr01_b[2],
ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2],
ssr01_i[3]);
snrt_ssr_repeat(SNRT_SSR_DM0, 1);
// Configure ft2 to store data and datat elements
// for (i1 = offset; i1 < m; i1 += stride * unroll0)
Expand All @@ -115,11 +110,9 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
// datat[i * n + j] = ft2.pop()
const uint32_t ssr2_b[4] = {2, unroll0, n, m / (stride * unroll0)};
const uint32_t ssr2_i[4] = {(uint32_t)datat - (uint32_t)data,
sizeof(double) * n * stride,
sizeof(double),
sizeof(double) * n * stride, sizeof(double),
sizeof(double) * n * stride * unroll0};
snrt_ssr_loop_4d(SNRT_SSR_DM2,
ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3],
snrt_ssr_loop_4d(SNRT_SSR_DM2, ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3],
ssr2_i[0], ssr2_i[1], ssr2_i[2], ssr2_i[3]);

// SSR start address need to be configured each time
Expand All @@ -130,21 +123,20 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,

// Center data
for (uint32_t i = offset; i < m; i += stride * unroll0) {

// Calculate row means
double m[2 * unroll0];
m[0] = 0.0; // mean(data[i])
m[1] = 0.0; // mean(datat[i])
m[2] = 0.0; // mean(data[i + stride])
m[3] = 0.0; // mean(datat[i + stride])
m[0] = 0.0; // mean(data[i])
m[1] = 0.0; // mean(datat[i])
m[2] = 0.0; // mean(data[i + stride])
m[3] = 0.0; // mean(datat[i + stride])
asm volatile(
"frep.o %[n_frep], %[n_insn], 0, 0 \n"
"fadd.d %[m0], ft0, %[m0] \n"
"fadd.d %[m1], ft1, %[m1] \n"
"fadd.d %[m2], ft0, %[m2] \n"
"fadd.d %[m3], ft1, %[m3] \n"
: [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]),
[ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3])
: [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]),
[ m3 ] "+f"(m[3])
: [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0)
: "ft0", "ft1", "ft2");
m[0] *= inv_n;
Expand All @@ -161,8 +153,8 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
"fsub.d ft2, ft1, %[m1] \n"
"fsub.d ft2, ft0, %[m2] \n"
"fsub.d ft2, ft1, %[m3] \n"
: [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]),
[ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3])
: [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]),
[ m3 ] "+f"(m[3])
: [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0)
: "ft0", "ft1", "ft2");
}
Expand Down Expand Up @@ -190,16 +182,16 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
// ft0.push(a[i * n + k])
// ft1.push(at[j * n + k])
const uint32_t ssr0_b[4] = {unroll1, n, m / unroll1, m / stride};
const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)};
snrt_ssr_loop_3d(SNRT_SSR_DM0,
ssr0_b[1], ssr0_b[2], ssr0_b[3],
ssr0_i[1], ssr0_i[2], ssr0_i[3]);
const uint32_t ssr0_i[4] = {0, sizeof(double), 0,
stride * n * sizeof(double)};
snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1],
ssr0_i[2], ssr0_i[3]);
snrt_ssr_repeat(SNRT_SSR_DM0, unroll1);
const uint32_t ssr1_b[4] = {unroll1, n, m / unroll1, m / stride};
const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll1 * n * sizeof(double), 0};
snrt_ssr_loop_4d(SNRT_SSR_DM1,
ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3],
ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double),
unroll1 * n * sizeof(double), 0};
snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3],
ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);

// SSR start address need to be configured each time
snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, data + offset * n);
Expand All @@ -208,7 +200,6 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,

for (uint32_t i = offset; i < m; i += stride) {
for (uint32_t j = 0; j < m; j += unroll1) {

double acc[unroll1];
acc[0] = 0;
acc[1] = 0;
Expand All @@ -227,8 +218,10 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
"fmul.d %[b3], %[acc3], %[alpha] \n"
: [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
[ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]),
[ b0 ] "=f"(cov[i * m + j + 0]), [ b1 ] "=f"(cov[i * m + j + 1]),
[ b2 ] "=f"(cov[i * m + j + 2]), [ b3 ] "=f"(cov[i * m + j + 3])
[ b0 ] "=f"(cov[i * m + j + 0]),
[ b1 ] "=f"(cov[i * m + j + 1]),
[ b2 ] "=f"(cov[i * m + j + 2]),
[ b3 ] "=f"(cov[i * m + j + 3])
: [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1),
[ alpha ] "f"(inv_n_m1)
: "ft0", "ft1", "ft2");
Expand All @@ -241,8 +234,8 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,

void covariance_job(covariance_args_t *args) {
uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes;
uint64_t local_a0_addr, local_at0_addr, local_b0_addr,
local_a1_addr, local_at1_addr, local_b1_addr;
uint64_t local_a0_addr, local_at0_addr, local_b0_addr, local_a1_addr,
local_at1_addr, local_b1_addr;
double *local_a[2];
double *local_at[2];
double *local_b[2];
Expand Down Expand Up @@ -287,12 +280,13 @@ void covariance_job(covariance_args_t *args) {

// Calculate number of iterations
sb_iterations = args->m_tiles * args->m_tiles;
if (DOUBLE_BUFFER) iterations = sb_iterations + 2;
else iterations = sb_iterations;
if (DOUBLE_BUFFER)
iterations = sb_iterations + 2;
else
iterations = sb_iterations;

// Iterate over all tiles
for (i = 0; i < iterations; i++) {

if (snrt_is_dm_core()) {
// DMA in
if (!DOUBLE_BUFFER || (i < sb_iterations)) {
Expand All @@ -305,18 +299,10 @@ void covariance_job(covariance_args_t *args) {
i_col = i_dma_in % args->m_tiles;

// Copy job operands in TCDM
snrt_dma_load_1d_tile(
local_a[buff_idx],
args->data,
i_row,
a_tile_size,
sizeof(double));
snrt_dma_load_1d_tile(
local_at[buff_idx],
args->data,
i_col,
a_tile_size,
sizeof(double));
snrt_dma_load_1d_tile(local_a[buff_idx], args->data, i_row,
a_tile_size, sizeof(double));
snrt_dma_load_1d_tile(local_at[buff_idx], args->data, i_col,
a_tile_size, sizeof(double));
snrt_dma_wait_all();

snrt_mcycle();
Expand All @@ -343,15 +329,9 @@ void covariance_job(covariance_args_t *args) {
i_col = i_dma_out % args->m_tiles;

// Copy job outputs from TCDM
snrt_dma_store_2d_tile(
args->cov,
local_b[buff_idx],
i_row,
i_col,
m_frac,
m_frac,
args->m,
sizeof(double));
snrt_dma_store_2d_tile(args->cov, local_b[buff_idx], i_row,
i_col, m_frac, m_frac, args->m,
sizeof(double));
snrt_dma_wait_all();

snrt_mcycle();
Expand Down
1 change: 0 additions & 1 deletion sw/apps/covariance/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include "data.h"

int main() {

covariance_job(&args);

return 0;
Expand Down
9 changes: 5 additions & 4 deletions sw/blas/axpy/src/args.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
#pragma once
#include <stdint.h>

// Function-pointer type shared by the AXPY kernel variants
// (naive, fma, opt): computes z[i] = a * x[i] + y[i] over n elements.
typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y,
                          double* z);

// Argument bundle consumed by axpy_job().
typedef struct {
    uint32_t n;         // total number of vector elements
    double a;           // scalar multiplier
    double* x;          // input vector x
    double* y;          // input vector y
    double* z;          // output vector z
    uint32_t n_tiles;   // number of tiles processed (double-buffered)
    axpy_fp_t funcptr;  // kernel variant invoked on each tile
} axpy_args_t;
40 changes: 20 additions & 20 deletions sw/blas/axpy/src/axpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
// Alignment granule spanning 32 TCDM banks.
// NOTE(review): BANK_ALIGNMENT is defined elsewhere — presumably one
// bank's width in bytes; confirm against the runtime headers.
#define TCDM_ALIGNMENT (32 * BANK_ALIGNMENT)
// Round `addr` up to the next TCDM_ALIGNMENT boundary (wraps the
// ALIGN_UP macro defined elsewhere).
#define ALIGN_UP_TCDM(addr) ALIGN_UP(addr, TCDM_ALIGNMENT)

static inline void axpy_naive(uint32_t n, double a, double* x, double* y, double* z) {
static inline void axpy_naive(uint32_t n, double a, double *x, double *y,
double *z) {
int core_idx = snrt_cluster_core_idx();
int frac = n / snrt_cluster_compute_core_num();
int offset = core_idx;
Expand All @@ -22,28 +23,27 @@ static inline void axpy_naive(uint32_t n, double a, double* x, double* y, double
snrt_fpu_fence();
}

// AXPY variant using an explicit fused multiply-add:
// z[i] = a * x[i] + y[i], computed by one fmadd.d per element.
// Work is strided across the cluster's compute cores: the core with
// index `core_idx` handles elements core_idx, core_idx + num_cores, ...
// NOTE(review): dropped the unused local `frac` from the original.
static inline void axpy_fma(uint32_t n, double a, double *x, double *y,
                            double *z) {
    int core_idx = snrt_cluster_core_idx();
    int offset = core_idx;

    for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) {
        asm volatile("fmadd.d %[z], %[a], %[x], %[y] \n"
                     : [ z ] "=f"(z[i])
                     : [ a ] "f"(a), [ x ] "f"(x[i]), [ y ] "f"(y[i]));
    }
    // Make the FPU stores visible before returning.
    snrt_fpu_fence();
}

static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double* z) {
static inline void axpy_opt(uint32_t n, double a, double *x, double *y,
double *z) {
int core_idx = snrt_cluster_core_idx();
int frac = n / snrt_cluster_compute_core_num();
int offset = core_idx;

snrt_ssr_loop_1d(SNRT_SSR_DM_ALL,
frac,
snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, frac,
snrt_cluster_compute_core_num() * sizeof(double));

snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x + offset);
Expand All @@ -57,24 +57,22 @@ static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double*
"fmadd.d ft2, %[a], ft0, ft1\n"
:
: [ n_frep ] "r"(frac - 1), [ a ] "f"(a)
: "ft0", "ft1", "ft2", "memory"
);

: "ft0", "ft1", "ft2", "memory");

snrt_fpu_fence();
snrt_ssr_disable();
}

static inline void axpy_job(axpy_args_t *args) {
uint32_t frac, offset, size;
uint64_t local_x0_addr, local_y0_addr, local_z0_addr,
local_x1_addr, local_y1_addr, local_z1_addr;
uint64_t local_x0_addr, local_y0_addr, local_z0_addr, local_x1_addr,
local_y1_addr, local_z1_addr;
double *local_x[2];
double *local_y[2];
double *local_z[2];
double *remote_x, *remote_y, *remote_z;
uint32_t iterations, i, i_dma_in, i_compute, i_dma_out, buff_idx;


#ifndef JOB_ARGS_PRELOADED
// Allocate space for job arguments in TCDM
axpy_args_t *local_args = (axpy_args_t *)snrt_l1_next();
Expand Down Expand Up @@ -102,8 +100,10 @@ static inline void axpy_job(axpy_args_t *args) {
local_z[0] = (double *)local_z0_addr;
if (DOUBLE_BUFFER) {
local_x1_addr = ALIGN_UP_TCDM(local_z0_addr + size);
local_y1_addr = ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT;
local_z1_addr = ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT;
local_y1_addr =
ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT;
local_z1_addr =
ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT;
local_x[1] = (double *)local_x1_addr;
local_y[1] = (double *)local_y1_addr;
local_z[1] = (double *)local_z1_addr;
Expand All @@ -115,7 +115,6 @@ static inline void axpy_job(axpy_args_t *args) {

// Iterate over all tiles
for (i = 0; i < iterations; i++) {

if (snrt_is_dm_core()) {
// DMA in
if (!DOUBLE_BUFFER || (i < args->n_tiles)) {
Expand Down Expand Up @@ -176,7 +175,8 @@ static inline void axpy_job(axpy_args_t *args) {

// Perform tile computation
axpy_fp_t fp = args->funcptr;
fp(frac, args->a, local_x[buff_idx], local_y[buff_idx], local_z[buff_idx]);
fp(frac, args->a, local_x[buff_idx], local_y[buff_idx],
local_z[buff_idx]);

snrt_mcycle();
}
Expand Down
1 change: 0 additions & 1 deletion sw/blas/axpy/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include "data.h"

int main() {

axpy_job(&args);

// TODO: currently only works for single cluster otherwise need to
Expand Down
Loading

0 comments on commit 5401e44

Please sign in to comment.