Skip to content

Commit

Permalink
Move args into tcdm before timing, simplify l1ptr calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
rogerbarton committed Jan 17, 2024
1 parent b29937d commit 459f106
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 40 deletions.
2 changes: 2 additions & 0 deletions sw/blas/gemm/src/gemm_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
// Luca Bertaccini <[email protected]>
// Luca Colagrande <[email protected]>

#pragma once

#include <stdint.h>

#include "snrt.h"
Expand Down
53 changes: 22 additions & 31 deletions sw/blas/gemm/src/gemm_occamy_2dpipe.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include <stdint.h>
#include <string.h>

#include "gemm.h"
#include "gemm_kernel.h"
#include "snrt.h"

#include "dump.h"
Expand Down Expand Up @@ -110,20 +110,7 @@ void gemm_oc_opt2d(double alpha, double beta, uint32_t m, uint32_t n,

// Setup layout for TCDM L1
// For double buffering, l1 is a size-2 array
TcdmLayout* l1 = (TcdmLayout*)snrt_l1_next();
TcdmLayout* l1Addr[SNRT_CLUSTER_NUM] = {0};

// Sync l1 pointers between clusters
if (snrt_is_dm_core()) {
l1AddrGlobal[p[1]] = l1;
}
snrt_global_barrier();
if (snrt_is_dm_core()) {
for (int i = 0; i < SNRT_CLUSTER_NUM; ++i)
l1Addr[i] = l1 + cluster_offset * (i - snrt_cluster_idx());

// memcpy(l1Addr, l1AddrGlobal, SNRT_CLUSTER_NUM * sizeof(*l1Addr));
}
TcdmLayout* l1 = snrt_l1_next();

bool l1Id_AB = false;
bool l1Id_C = false;
Expand All @@ -139,35 +126,39 @@ void gemm_oc_opt2d(double alpha, double beta, uint32_t m, uint32_t n,
int ib_prev, jb_prev, kb_prev;
bool ib_dir = false, jb_dir = false, kb_dir = false;

bool storeC = false;

// Debug
volatile int ib_cnt = 0, jb_cnt = 0, kb_cnt = 0;

if (snrt_is_compute_core()) {
snrt_global_barrier(); // DMA core is one index ahead
}
bool storeC = false;

// Compute C2C sources for 2D pipeline
volatile const uint32_t pk =
(PI + 2 * PJ - pi - pj - 1) % PJ; // pipeline step
int PK = PJ; // pipeline depth
// -- Compute C2C sources for 2D pipeline
const uint32_t pk = (PI + 2 * PJ - pi - pj - 1) % PJ; // pipeline step
const int PK = PJ; // pipeline depth

// Determine the C2C source cluster index for each matrix; an index < 0 means the data comes from DRAM
TcdmLayout* c2cL1_A = NULL;
TcdmLayout* c2cL1_B = NULL;
if (snrt_is_dm_core()) {
dump_pk(pk);

const bool fetch_dram = pk == 0;
// -- Sync l1 pointers between clusters
TcdmLayout* l1Ptr[SNRT_CLUSTER_NUM];
for (int i = 0; i < SNRT_CLUSTER_NUM; ++i)
l1Ptr[i] = l1 + cluster_offset * (i - snrt_cluster_idx());

// 2D pipeline indices, see notes or python notebook for details
// Works for PI = PJ
volatile const uint32_t p_srcA = pi * PJ + ((2 * PJ - pi - pk) % PJ);
volatile const uint32_t p_srcB = pj + PJ * ((2 * PJ - pj - pk) % PJ);
dump_p_src(fetch_dram ? -1 : p_srcA);
dump_p_src(fetch_dram ? -1 : p_srcB);

c2cL1_A = fetch_dram ? NULL : l1Addr[p_srcA];
c2cL1_B = fetch_dram ? NULL : l1Addr[p_srcB];
const bool fetch_dram = pk == 0;
c2cL1_A = fetch_dram ? NULL : l1Ptr[p_srcA];
c2cL1_B = fetch_dram ? NULL : l1Ptr[p_srcB];

// dump_p_src(fetch_dram ? -1 : p_srcA);
// dump_p_src(fetch_dram ? -1 : p_srcB);
}

if (snrt_is_compute_core()) {
snrt_global_barrier(); // DMA core is one index ahead
}

// Wait for pipeline to be filled
Expand Down
33 changes: 24 additions & 9 deletions sw/blas/gemm/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,30 @@ NAMED_DUMP(uint32_t, err, 0x7)

int main() {
const bool setup_ssr = true;
uint32_t start_cycle = snrt_mcycle();

// load into TCDM
uint32_t data_M = M;
uint32_t data_N = N;
uint32_t data_K = K;
uint32_t data_TA = TA;
uint32_t data_TB = TB;
double data_BETA = BETA;
uint32_t data_dtype_size = dtype_size;
uint32_t data_expand = expand;
double* data_a = a;
double* data_b = b;
double* data_c = c;

uint32_t lda = K;
uint32_t ldb = N;
uint32_t ldc = N;

gemm_oc(dtype_size, expand, setup_ssr, TA, TB, M, N, K, 1, a, lda, b, ldb,
BETA, c, ldc);

uint32_t end_cycle = snrt_mcycle();
for (volatile int i = 2; i > 0; --i) {
if (i == 1) snrt_mcycle(); // start
gemm_oc(data_dtype_size, data_expand, setup_ssr, data_TA, data_TB, data_M, data_N, data_K, 1,
data_a, lda, data_b, ldb, data_BETA, data_c, ldc);
if (i == 1) snrt_mcycle(); // end
}

snrt_fpu_fence();
snrt_global_barrier();
Expand All @@ -39,10 +53,11 @@ int main() {
uint32_t errors = M * N;

if (snrt_global_core_idx() == 0) {
for (uint32_t m = 0; m < M; m++) {
for (uint32_t n = 0; n < N; n++) {
uint32_t idx = m * N + n;
if (fabs(result[idx] - c[idx]) < 0.001) errors--;
for (uint32_t i = 0; i < M; i++) {
for (uint32_t j = 0; j < N; j++) {
uint32_t idx = i * N + j;
if (fabs(result[idx] - c[idx]) < 0.001)
errors--;
}
}
// printf("%d/%d Errors\n", errors, M * N);
Expand Down

0 comments on commit 459f106

Please sign in to comment.