Commit

clang-format
rogerbarton committed Jan 15, 2024
1 parent 68d5e3f commit 220a908
Showing 4 changed files with 184 additions and 167 deletions.
96 changes: 50 additions & 46 deletions sw/blas/gemm/src/gemm_occamy_1dpipe.h
@@ -2,12 +2,12 @@

#pragma once

Check failure on line 3 in sw/blas/gemm/src/gemm_occamy_1dpipe.h (GitHub Actions / Check License headers):
FAILED: First comment ended before licence notice

#include <stdint.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#include "snrt.h"
#include "gemm.h"
#include "snrt.h"

#include "dump.h"
NAMED_DUMP(uint32_t, aIdx, 0x1a)
@@ -20,29 +20,32 @@ NAMED_DUMP(double, a, 0xa)
NAMED_DUMP(double, b, 0xb)
NAMED_DUMP(double, c, 0xc)

#define MIN(a,b) ((a)<(b)?(a):(b))
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/**
* \brief Implements a reversing loop for an index range
* \param begin Beginning of the range
* \param end End of the range
* \param dir Sets the direction of traversal. True: loop starts at begin.
* \param i_prev Set the previous index to the first index, must update this manually at the end of the loop.
* \details i_end_floor will contain the exact end with the stride, s.t. the reversed loop starts at the correct index.
* \param i_prev Set the previous index to the first index, must update this
* manually at the end of the loop. \details i_end_floor will contain the exact
* end with the stride, s.t. the reversed loop starts at the correct index.
*/
#define FOR_EACH(i, begin, end, stride, dir, i_prev) \
dir = !dir; \
const int i##_end_floor = ((end - begin + stride - 1) / stride) * stride - stride + begin; \
const int i##_first = dir ? begin : i##_end_floor; \
const int i##_last = dir ? i##_end_floor : begin; \
i = i##_first; \
i_prev = i; \
for (; dir ? i <= i##_last : i >= i##_last; i = dir ? i + stride : i - stride)

#define L1_M 8 //128;
#define L1_N 8 //128;
#define L1_K 8 //128;
#define FOR_EACH(i, begin, end, stride, dir, i_prev) \
dir = !dir; \
const int i##_end_floor = \
((end - begin + stride - 1) / stride) * stride - stride + begin; \
const int i##_first = dir ? begin : i##_end_floor; \
const int i##_last = dir ? i##_end_floor : begin; \
i = i##_first; \
i_prev = i; \
for (; dir ? i <= i##_last : i >= i##_last; \
i = dir ? i + stride : i - stride)
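/*
 * Editorial sketch (not part of this commit): a minimal use of FOR_EACH, with
 * `i`, `i_prev`, and `dir` declared by the caller, as the kernel below does.
 *
 *   int i, i_prev;
 *   bool dir = false;  // FOR_EACH toggles this, so the first pass runs forward
 *   FOR_EACH(i, 0, 10, 3, dir, i_prev) {
 *       // The first execution visits 0, 3, 6, 9; re-executing the same
 *       // statement (with dir still in scope) flips dir and visits 9, 6, 3, 0,
 *       // resuming where the previous pass stopped.
 *       i_prev = i;  // manual update required by the macro contract
 *   }
 */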

#define L1_M 8 // 128;
#define L1_N 8 // 128;
#define L1_K 8 // 128;
#define L1_LDA L1_K
#define L1_LDB L1_N
#define L1_LDC L1_N
@@ -60,14 +63,14 @@ NAMED_DUMP(TcdmLayout*, l1, 0x8)

TcdmLayout* l1AddrGlobal[SNRT_CLUSTER_NUM] = {0};

void gemm_oc_opt1d(double alpha, double beta,
uint32_t m, uint32_t n, uint32_t k,
double* A, double* B, double* C,
uint32_t lda, uint32_t ldb, uint32_t ldc) {
void gemm_oc_opt1d(double alpha, double beta, uint32_t m, uint32_t n,
uint32_t k, double* A, double* B, double* C, uint32_t lda,
uint32_t ldb, uint32_t ldc) {
/**
* Problem is double buffered in L1. The buffer that is used is toggled at each iteration.
* The DMA cores are one index step ahead so they load the data in advance into the buffer that will be used.
*/
* Problem is double buffered in L1. The buffer that is used is toggled at
* each iteration. The DMA cores are one index step ahead so they load the
* data in advance into the buffer that will be used.
*/
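/*
 * Editorial sketch (not part of this commit): the double-buffering pattern
 * described above, in isolation. One buffer is filled by the DMA core while
 * the compute cores consume the other; load_next_tile and compute_on_tile are
 * placeholders, the snrt_* calls are the ones used elsewhere in this file.
 *
 *   bool buf = false;
 *   if (snrt_is_dm_core()) load_next_tile(&l1[buf]);  // prime the first buffer
 *   snrt_global_barrier();
 *   for (int it = 0; it < n_tiles; ++it) {
 *       if (snrt_is_dm_core()) {
 *           if (it + 1 < n_tiles)
 *               load_next_tile(&l1[!buf]);   // fetch data for iteration it + 1
 *       } else {
 *           compute_on_tile(&l1[buf]);       // data loaded one step earlier
 *       }
 *       buf = !buf;                          // swap buffer roles
 *       snrt_global_barrier();
 *   }
 */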

volatile uint32_t p[3] = {0, 0, 0};
volatile uint32_t P[3] = {0, 0, 0};
@@ -76,12 +79,11 @@ void gemm_oc_opt1d(double alpha, double beta,

// Setup layout for TCDM L1
// For double buffering l1 is a size 2 array
TcdmLayout* l1 = (TcdmLayout*) snrt_l1_next();
TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next();
TcdmLayout* l1Addr[SNRT_CLUSTER_NUM] = {0};

// Sync l1 pointers between clusters
if (snrt_is_dm_core())
l1AddrGlobal[p[1]] = l1;
if (snrt_is_dm_core()) l1AddrGlobal[p[1]] = l1;
snrt_global_barrier();
if (snrt_is_dm_core()) {
memcpy(l1Addr, l1AddrGlobal, SNRT_CLUSTER_NUM * sizeof(*l1Addr));
@@ -91,7 +93,7 @@ void gemm_oc_opt1d(double alpha, double beta,
}

bool l1Id_AB = false;
bool l1Id_C = false;
bool l1Id_C = false;

// Initialize indices
const uint32_t I = m, J = n, K = k;
@@ -110,20 +112,19 @@ volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;
volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;

if (snrt_is_compute_core()) {
snrt_global_barrier(); // DMA core is one index ahead
snrt_global_barrier(); // DMA core is one index ahead
}

// -- Compute C2C sources for 2D pipeline
volatile const uint32_t pk = MAX(pi, pj); // pipeline step
int PK = MAX(PI, PJ); // pipeline depth
volatile const uint32_t pk = MAX(pi, pj); // pipeline step
int PK = MAX(PI, PJ); // pipeline depth

// Determine C2C source cluster index for each matrix, < 0 is from DRAM
int pIdx_A = pj - 1;
int pIdx_B = pi - 1;
TcdmLayout* const p_srcA = pIdx_A == 0 ? NULL : l1Addr[pIdx_A - 1];
TcdmLayout* const p_srcB = pIdx_B == 0 ? NULL : l1Addr[pIdx_B - 1];


// Wait for pipeline to be filled
for (int pipeline = pk; pipeline > 0; --pipeline) {
snrt_global_barrier();
@@ -140,8 +141,7 @@ void gemm_oc_opt1d(double alpha, double beta,
dump_ib(ib);
dump_jb(jb);
snrt_dma_load_2d_tile(l1_C, C, ib, jb, L1_M, L1_N, ldc, FP64);
if (ib != ib_first || jb != jb_first)
storeC = true;
if (ib != ib_first || jb != jb_first) storeC = true;
}

FOR_EACH(kb, 0, K / L1_K, 1, kb_dir, kb_prev) {
@@ -153,54 +153,58 @@
if (snrt_is_dm_core()) {
// TODO: use multicast instead
if (p_srcA == NULL)
snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda, FP64);
snrt_dma_load_2d_tile(l1_A, A, ib, kb, L1_M, L1_K, lda,
FP64);
else {
double* const c2c_A = p_srcA[l1Id_AB].A;
snrt_dma_start_1d(l1_A, c2c_A, L1_M * L1_K * FP64);
}
if (p_srcB == NULL)
snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb, FP64);
snrt_dma_load_2d_tile(l1_B, B, kb, jb, L1_K, L1_N, ldb,
FP64);
else {
double* const c2c_B = p_srcB[l1Id_AB].B;
snrt_dma_start_1d(l1_B, c2c_B, L1_K * L1_N * FP64);
}

snrt_dma_wait_all();
} else {
// solve block already in l1, parallelize inside each cluster
gemm(FP64, 0, true, false, false,
L1_M, L1_N, L1_K, alpha,
// solve block already in l1, parallelize inside each
// cluster
gemm(FP64, 0, true, false, false, L1_M, L1_N, L1_K, alpha,
l1_A, L1_LDA, l1_B, L1_LDB, beta, l1_C, L1_LDC);
}

l1Id_AB = !l1Id_AB; // switch buffers
l1Id_AB = !l1Id_AB; // switch buffers
snrt_global_barrier();

if (snrt_is_dm_core()) {
if (storeC) {
storeC = false;
snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64);
snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev,
jb_prev, L1_M, L1_N, ldc, FP64);
}
}
kb_prev = kb;
}

l1Id_C = !l1Id_C; // switch buffers
l1Id_C = !l1Id_C; // switch buffers
jb_prev = jb;
ib_prev = ib;
}
}

if (snrt_is_dm_core()) {
snrt_global_barrier(); // DMA core is one index ahead
snrt_global_barrier(); // DMA core is one index ahead

// store final tile
snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N, ldc, FP64);
snrt_dma_store_2d_tile(C, l1[!l1Id_C].C, ib_prev, jb_prev, L1_M, L1_N,
ldc, FP64);
snrt_dma_wait_all();
}

// Wait for pipeline to be emptied
for (int pipeline = pk; pipeline < PK -1; ++pipeline) {
for (int pipeline = pk; pipeline < PK - 1; ++pipeline) {
snrt_global_barrier();
}
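/*
 * Editorial note (not part of this commit): assuming pi and pj are zero-based
 * cluster coordinates, pk = MAX(pi, pj) and PK = MAX(PI, PJ), a cluster spends
 * pk barriers filling the pipeline above and PK - 1 - pk barriers draining it
 * here, so every cluster pays the same PK - 1 extra barriers in total.
 */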
}
