Skip to content

Commit

Permalink
[software] Add f32 and f16 dotp app
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Sep 5, 2024
1 parent e77933c commit c346c1c
Show file tree
Hide file tree
Showing 10 changed files with 848 additions and 123 deletions.
78 changes: 78 additions & 0 deletions software/apps/baremetal/dotp_f16/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Author: Marco Bertuletti, ETH Zurich

#include <stdint.h>
#include <string.h>

#include "dma.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"

#include "data_dotp_f16.h"
// Number of entries used for the per-bank arrays below: one per core times
// the banking factor (both constants come from outside this file).
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
// Reduction-mode switches; they are defined before the kernel header
// "baremetal/mempool_dotp_f16.h" is included further down.
// NOTE(review): presumably that header selects its reduction scheme from
// these macros -- confirm there.
// #define SINGLE_CORE_REDUCTION
#define BINARY_REDUCTION

// Vectors for kernel computation
// Input operands of LEN half-precision elements each, placed in the
// ".l1_prio" section. aligned(LEN) requests an alignment of LEN *bytes*, so
// LEN has to be a power of two for these declarations to compile.
__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
// NUM_BANKS 32-bit words, cleared by main() before the kernel runs.
// NOTE(review): presumably synchronization state for the reduction -- confirm
// in the kernel header.
uint32_t red_barrier[NUM_BANKS]
__attribute__((aligned(NUM_BANKS), section(".l1_prio")));
// Result/partial-sum buffer handed to the kernel; main() prints sum[0].
// It holds 2 * NUM_BANKS halves, but main() only clears the first NUM_BANKS.
// NOTE(review): verify whether the kernel relies on the upper half being zero.
__fp16 sum[2 * NUM_BANKS]
__attribute__((aligned(NUM_BANKS), section(".l1_prio")));

#include "baremetal/mempool_dotp_f16.h"

/**
 * Half-precision dot-product benchmark, executed by every core.
 *
 * Core 0 copies the operands l2_A/l2_B into l1_A/l1_B, all cores clear the
 * first NUM_BANKS entries of `sum` and `red_barrier`, the selected kernel is
 * timed with mempool_get_timer(), and core 0 finally prints the cycle count
 * together with the raw bits of the computed and the reference result.
 *
 * @return 0 unconditionally.
 */
int main() {

  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();
  uint32_t time_init, time_end;
  mempool_barrier_init(core_id);

  time_init = 0;
  time_end = 0;
  if (core_id == 0) {
    // Size taken from the destination arrays themselves, so the byte count
    // cannot drift from the element type (LEN * sizeof(__fp16)).
    dma_memcpy_blocking(l1_A, l2_A, sizeof(l1_A));
    dma_memcpy_blocking(l1_B, l2_B, sizeof(l1_B));
  }
  // Each core clears a strided share of the first NUM_BANKS entries.
  // NOTE(review): `sum` is declared with 2 * NUM_BANKS entries; confirm the
  // kernel does not depend on the upper half being zeroed as well.
  for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) {
    sum[k] = 0;
    red_barrier[k] = 0;
  }
  mempool_barrier(num_cores);

  // // SINGLE-CORE
  // time_init = mempool_get_timer();
  // dotp_f16s(l1_A, l1_B, sum, LEN);
  // // dotp_f16s_unrolled4(l1_A, l1_B, sum, LEN);
  // time_end = mempool_get_timer();

  // // PARALLEL
  // time_init = mempool_get_timer();
  // dotp_f16vecp_unrolled4(l1_A, l1_B, sum, LEN, num_cores);
  // // dotp_f16p(l1_A, l1_B, sum, LEN, num_cores);
  // time_end = mempool_get_timer();

  // PARALLEL, LOCAL ACCESSES
  time_init = mempool_get_timer();
  dotp_f16vecp_local_unrolled4(l1_A, l1_B, sum, LEN);
  time_end = mempool_get_timer();

  // Check results
  mempool_barrier(num_cores);
  if (core_id == 0) {
    uint32_t clock_cycles = (time_end - time_init);
    uint32_t result_bits;
    uint16_t check_bits;
    // Copy the raw bits out with memcpy instead of dereferencing a casted
    // pointer. The first 32 bits of `sum` (sum[0] and sum[1]) are kept so the
    // "Result" line prints what it always printed.
    memcpy(&result_bits, &sum[0], sizeof(result_bits));
    // l2_C is a single __fp16 (2 bytes); reading 32 bits from it would run
    // past the object, so only its 16 bits are copied and printed.
    memcpy(&check_bits, &l2_C, sizeof(check_bits));
    printf("\nKernel execution takes %u clock cycles\n",
           (unsigned)clock_cycles);
    printf("Result ==> %x\n", (unsigned)result_bits);
    printf("Check ==> %x\n\n", (unsigned)check_bits);
  }
  mempool_barrier(num_cores);

  return 0;
}
76 changes: 76 additions & 0 deletions software/apps/baremetal/dotp_f32/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Author: Marco Bertuletti, ETH Zurich

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "dma.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"

#include "data_dotp_f32.h"
// Number of entries used for the per-bank arrays below: one per core times
// the banking factor (both constants come from outside this file).
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
// Reduction-mode switches; they are defined before the kernel header
// "baremetal/mempool_dotp_f32.h" is included further down.
// NOTE(review): presumably that header selects its reduction scheme from
// these macros -- confirm there.
// #define SINGLE_CORE_REDUCTION
#define BINARY_REDUCTION

// Vectors for kernel computation
// Input operands of LEN single-precision elements each, placed in the
// ".l1_prio" section. aligned(LEN) requests an alignment of LEN *bytes*, so
// LEN has to be a power of two for these declarations to compile.
float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
// NUM_BANKS 32-bit words, cleared by main() before the kernel runs.
// NOTE(review): presumably synchronization state for the reduction -- confirm
// in the kernel header.
uint32_t red_barrier[NUM_BANKS]
__attribute__((aligned(NUM_BANKS), section(".l1_prio")));
// Result/partial-sum buffer handed to the kernel; main() clears all NUM_BANKS
// entries and reports sum[0].
float sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

#include "baremetal/mempool_dotp_f32.h"

/**
 * Single-precision dot-product benchmark, executed by every core.
 *
 * Core 0 copies the operands l2_A/l2_B into l1_A/l1_B, all cores clear `sum`
 * and `red_barrier`, the selected kernel is timed with mempool_get_timer(),
 * and core 0 finally prints the cycle count together with the raw bits of
 * the computed result (sum[0]) and the reference value (l2_C).
 *
 * @return 0 unconditionally.
 */
int main() {

  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();
  uint32_t time_init, time_end;
  mempool_barrier_init(core_id);

  time_init = 0;
  time_end = 0;
  if (core_id == 0) {
    // Size taken from the destination arrays themselves, so the byte count
    // cannot drift from the element type (LEN * sizeof(float)).
    dma_memcpy_blocking(l1_A, l2_A, sizeof(l1_A));
    dma_memcpy_blocking(l1_B, l2_B, sizeof(l1_B));
  }
  // Each core clears a strided share of the NUM_BANKS entries.
  for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) {
    sum[k] = 0;
    red_barrier[k] = 0;
  }
  mempool_barrier(num_cores);

  // // SINGLE-CORE
  // time_init = mempool_get_timer();
  // dotp_f32s_unrolled4(l1_A, l1_B, sum, LEN);
  // time_end = mempool_get_timer();

  // // PARALLEL
  // time_init = mempool_get_timer();
  // dotp_f32p(l1_A, l1_B, sum, LEN, num_cores);
  // time_end = mempool_get_timer();

  // PARALLEL, LOCAL ACCESSES
  time_init = mempool_get_timer();
  dotp_f32p_local_unrolled4(l1_A, l1_B, sum, LEN);
  time_end = mempool_get_timer();

  // Check results
  mempool_barrier(num_cores);
  if (core_id == 0) {
    uint32_t clock_cycles = (time_end - time_init);
    uint32_t result_bits;
    uint32_t check_bits;
    // A float passed through "..." is promoted to double, so printing it
    // with %d is undefined and yields garbage. Print the 32-bit IEEE-754
    // pattern in hex instead (same presentation as the f16 app); memcpy is
    // the aliasing-safe way to obtain the bits.
    memcpy(&result_bits, &sum[0], sizeof(result_bits));
    memcpy(&check_bits, &l2_C, sizeof(check_bits));
    printf("\nKernel execution takes %u clock cycles\n",
           (unsigned)clock_cycles);
    printf("Result ==> %x\n", (unsigned)result_bits);
    printf("Check ==> %x\n\n", (unsigned)check_bits);
  }
  mempool_barrier(num_cores);

  return 0;
}
3 changes: 1 addition & 2 deletions software/apps/baremetal/dotp_i32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ uint32_t red_barrier[NUM_BANKS]
__attribute__((aligned(NUM_BANKS), section(".l1_prio")));
int32_t sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

#include "baremetal/mempool_dotp_i32p.h"
#include "baremetal/mempool_dotp_i32s.h"
#include "baremetal/mempool_dotp_i32.h"

int main() {

Expand Down
24 changes: 24 additions & 0 deletions software/data/data_dotp_f16.h.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright 2022 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
\
<% def array_to_cstr(array):
    # Render `array` as a brace-enclosed C initializer list: every value is
    # cast to __fp16 and printed with four decimals, eight values per row.
    #
    # The values are joined rather than appended with a trailing ", " that is
    # trimmed afterwards: trimming removed the opening brace for an empty
    # array and left a dangling comma whenever the length was a multiple of
    # eight.
    values = ['(__fp16){:.4f}'.format(a) for a in array]
    rows = [', '.join(values[i:i + 8]) for i in range(0, len(values), 8)]
    return '{\n' + ',\n'.join(rows) + '}'
%> \

## Number of elements in each input vector, taken from the `Len` template
## argument.
#define LEN (${Len})

## First input vector: `Len` half-precision values rendered by
## array_to_cstr(A), placed in the ".l2" section and aligned to
## sizeof(int32_t) bytes.
__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};

## Second input vector, same layout as l2_A.
__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};

## Reference value: a single __fp16 scalar (2 bytes of data) initialised from
## the `C` template argument.
__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = (__fp16)${C}f;
24 changes: 24 additions & 0 deletions software/data/data_dotp_f32.h.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright 2022 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
\
<% def array_to_cstr(array):
    # Render `array` as a brace-enclosed C initializer list: every value is
    # formatted with str.format and suffixed with 'f' (C float literal),
    # eight values per row.
    #
    # The values are joined rather than appended with a trailing ", " that is
    # trimmed afterwards: trimming removed the opening brace for an empty
    # array and left a dangling comma whenever the length was a multiple of
    # eight.
    values = ['{}f'.format(a) for a in array]
    rows = [', '.join(values[i:i + 8]) for i in range(0, len(values), 8)]
    return '{\n' + ',\n'.join(rows) + '}'
%> \

## Number of elements in each input vector, taken from the `Len` template
## argument.
#define LEN (${Len})

## First input vector: `Len` single-precision values rendered by
## array_to_cstr(A), placed in the ".l2" section and aligned to
## sizeof(int32_t) bytes.
float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};

## Second input vector, same layout as l2_A.
float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};

## Reference value: a single float scalar initialised from the `C` template
## argument.
float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = ${C}f;
38 changes: 38 additions & 0 deletions software/data/generate_dotp.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,24 @@ def generate_dotp_i32(Len):
C = np.dot(A, B)
return A, B, C


def generate_dotp_f32(Len):
    """Build random float32 operands and their reference dot product.

    Two vectors of ``Len`` samples are drawn with ``np.random.rand`` (first
    A, then B) and converted to float32; their dot product, also converted
    to float32, is the expected result.

    Returns the tuple ``(A, B, C)``.
    """
    dtype = np.float32
    # Drawn in a fixed order so a seeded generator yields the same A and B.
    operands = [np.random.rand(Len).astype(dtype) for _ in range(2)]
    A, B = operands
    C = np.dot(A, B).astype(dtype)
    return A, B, C


def generate_dotp_f16(Len):
    """Build random float16 operands and their reference dot product.

    Two vectors of ``Len`` samples are drawn with ``np.random.rand`` (first
    A, then B) and converted to float16; their dot product, also converted
    to float16, is the expected result.

    Returns the tuple ``(A, B, C)``.
    """
    dtype = np.float16
    # Drawn in a fixed order so a seeded generator yields the same A and B.
    operands = [np.random.rand(Len).astype(dtype) for _ in range(2)]
    A, B = operands
    C = np.dot(A, B).astype(dtype)
    return A, B, C

##################
# compute_result #
##################
Expand Down Expand Up @@ -72,6 +90,26 @@ def main():
'Len': Len}
gen_data_header_file(args.outdir, tpl, **kwargs)

A, B, C = generate_dotp_f32(Len)
tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_f32.h.tpl"
kwargs = {
'name': 'data_dotp_f32',
'A': A,
'B': B,
'C': C,
'Len': Len}
gen_data_header_file(args.outdir, tpl, **kwargs)

A, B, C = generate_dotp_f16(Len)
tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_f16.h.tpl"
kwargs = {
'name': 'data_dotp_f16',
'A': A,
'B': B,
'C': C,
'Len': Len}
gen_data_header_file(args.outdir, tpl, **kwargs)


if __name__ == "__main__":
main()
Loading

0 comments on commit c346c1c

Please sign in to comment.