Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add mlperf tiny snax-mlir kernel #31

Draft
wants to merge 3 commits into
base: chip_antwerp
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions target/sim/sw/device/apps/snax/snax-mlperf-tiny-ad01/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright 2023 KU Leuven.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Xiaoling Yi <[email protected]>

APP = snax-mlperf-tiny-ad01

# Header search paths: the snax-mlir runtime headers and the local data
# directory that holds data.h.
INCDIRS = ../../../snax-mlir/runtime/include
INCDIRS += data

# Include the snax-mlir prebuilt binary in the final build
RISCV_LDFLAGS += bin/snax-mlperf-tiny-ad01.o

SRCS = src/snax-mlperf-tiny-ad01.c

# data/Makefile is expected to define DATA_H; common.mk provides the shared
# build variables and rules used below.
include ./data/Makefile
include ../../common.mk


# The dependency file must be regenerated whenever the data header changes.
$(DEP): $(DATA_H)

# Compile and link the application. The recipe line must start with a TAB
# character, otherwise make aborts with "missing separator".
build/snax-mlperf-tiny-ad01.part.elf: $(DEP) $(LD_SRCS) | $(BUILDDIR)
	$(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@


Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Directory of the makefile currently being read (the last entry of
# MAKEFILE_LIST), resolved to a canonical absolute path.
MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
DATA_DIR := $(realpath $(MK_DIR))


# Absolute path of the data header that lives next to this makefile.
DATA_H = $(DATA_DIR)/data.h


430 changes: 430 additions & 0 deletions target/sim/sw/device/apps/snax/snax-mlperf-tiny-ad01/data/data.h

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#include "stdint.h"

#include "data.h"
#include "memref.h"
#include "snax_rt.h"

/*
* These libraries are included from github.com/KULeuven-MICAS/snitch_cluster.
* Interested users might want to look at:
*
* /sw/snRuntime/api
* /target/snitch_cluster/sw/runtime/rtl/src
* /target/snitch_cluster/sw/runtime/common
* */
#include <snrt.h>

/*
 * Network entry point. It is only declared here; the definition is presumably
 * supplied by the prebuilt snax-mlir object linked into the application
 * (TODO confirm).
 *
 * output: descriptor that the caller reads back after the call.
 * input:  descriptor of the input data.
 */
void _mlir_ciface_run_network(TwoDMemrefI8_t *output, TwoDMemrefI8_t *input);

/*
 * Debug hook for the GeMM stage: core 0 prints the operand addresses and the
 * first five elements of each operand, then every core announces itself.
 *
 * _ptr_a, _ptr_b: addresses of the int8 operands, passed as 32-bit integers.
 * _ptr_c:         address of the int32 operand, passed as a 32-bit integer.
 * when:           caller-chosen tag echoed in the header line.
 *
 * Each calling core executes snrt_cluster_hw_barrier() 20 times in the
 * roll-call loop, so presumably all cores of the cluster have to call this
 * function together (TODO confirm against the barrier semantics).
 */
void _mlir_ciface_snax_debug_gemm(int32_t _ptr_a, int32_t _ptr_b,
                                  int32_t _ptr_c, int32_t when) {
    /* Convert through intptr_t so the integer-to-pointer cast is explicit. */
    int8_t *ptr_a = (int8_t *)(intptr_t)_ptr_a;
    int8_t *ptr_b = (int8_t *)(intptr_t)_ptr_b;
    int32_t *ptr_c = (int32_t *)(intptr_t)_ptr_c;

    int thisc = snrt_cluster_core_idx();

    if (thisc == 0) {
        /* %p requires a void * argument; %d requires an int argument. */
        printf("Debugging GeMM at t = %d with A at %p, B at %p, C at %p\r\n",
               (int)when, (void *)ptr_a, (void *)ptr_b, (void *)ptr_c);

        for (int i = 0; i < 5; i++) {
            printf("i%d -> A=%d, B=%d, C=%d\r\n", i, ptr_a[i], ptr_b[i],
                   (int)ptr_c[i]);
        }
    }

    /* Roll call: one barrier per probed core index keeps the output ordered. */
    for (uint8_t i = 0; i < 20; i++) {
        if (thisc == i) {
            printf("Core %d present.\r\n", thisc);
            if (snrt_is_dm_core()) {
                printf("I am a dm core\r\n");
            }
        }
        snrt_cluster_hw_barrier();
    }
}

/*
 * Debug hook for the bias stage: core 0 prints the operand addresses and the
 * first five elements of each operand, then every core announces itself.
 *
 * _ptr_a, _ptr_b, _ptr_c: addresses of int32 buffers, passed as 32-bit
 *                         integers.
 * when:                   caller-chosen tag echoed in the header line.
 *
 * Each calling core executes snrt_cluster_hw_barrier() 20 times in the
 * roll-call loop, so presumably all cores of the cluster have to call this
 * function together (TODO confirm against the barrier semantics).
 */
void _mlir_ciface_snax_debug_bias(int32_t _ptr_a, int32_t _ptr_b,
                                  int32_t _ptr_c, int32_t when) {
    /* Convert through intptr_t so the integer-to-pointer cast is explicit. */
    int32_t *ptr_a = (int32_t *)(intptr_t)_ptr_a;
    int32_t *ptr_b = (int32_t *)(intptr_t)_ptr_b;
    int32_t *ptr_c = (int32_t *)(intptr_t)_ptr_c;

    int thisc = snrt_cluster_core_idx();
    if (thisc == 0) {
        /* %p requires a void * argument; %d requires an int argument. */
        printf("Debugging bias at t = %d with A at %p, B at %p, C at %p\r\n",
               (int)when, (void *)ptr_a, (void *)ptr_b, (void *)ptr_c);

        for (int i = 0; i < 5; i++) {
            printf("i%d -> A=%d, B=%d, C=%d\r\n", i, (int)ptr_a[i],
                   (int)ptr_b[i], (int)ptr_c[i]);
        }
    }

    /* Roll call: one barrier per probed core index keeps the output ordered. */
    for (uint8_t i = 0; i < 20; i++) {
        if (thisc == i) {
            printf("Core %d present.\r\n", thisc);
            if (snrt_is_dm_core()) {
                printf("I am a dm core\r\n");
            }
        }
        snrt_cluster_hw_barrier();
    }
}

/*
 * Debug hook for the SIMD stage: core 0 prints the operand addresses and the
 * first 128 elements of A and C, then every core announces itself.
 *
 * _ptr_a: address of an int32 buffer, passed as a 32-bit integer.
 * _ptr_b: unused; kept so the signature matches the other debug hooks.
 * _ptr_c: address of an int8 buffer, passed as a 32-bit integer.
 * when:   caller-chosen tag echoed in the header line.
 *
 * Each calling core executes snrt_cluster_hw_barrier() 20 times in the
 * roll-call loop, so presumably all cores of the cluster have to call this
 * function together (TODO confirm against the barrier semantics).
 */
void _mlir_ciface_snax_debug_simd(int32_t _ptr_a, int32_t _ptr_b,
                                  int32_t _ptr_c, int32_t when) {
    (void)_ptr_b; /* intentionally unused */

    /* Convert through intptr_t so the integer-to-pointer cast is explicit. */
    int32_t *ptr_a = (int32_t *)(intptr_t)_ptr_a;
    int8_t *ptr_c = (int8_t *)(intptr_t)_ptr_c;

    int thisc = snrt_cluster_core_idx();
    if (thisc == 0) {
        /* %p requires a void * argument; %d requires an int argument. */
        printf("Debugging SIMD at t = %d with A at %p, C at %p\r\n", (int)when,
               (void *)ptr_a, (void *)ptr_c);

        for (int i = 0; i < 128; i++) {
            printf("i%d -> A=%d, C=%d\r\n", i, (int)ptr_a[i], ptr_c[i]);
        }
    }

    /* Roll call: one barrier per probed core index keeps the output ordered. */
    for (uint8_t i = 0; i < 20; i++) {
        if (thisc == i) {
            printf("Core %d present.\r\n", thisc);
            if (snrt_is_dm_core()) {
                printf("I am a dm core\r\n");
            }
        }
        snrt_cluster_hw_barrier();
    }
}

/*
 * Application entry point.
 *
 * Cores outside cluster 0 return immediately. Cores of cluster 0 build the
 * input descriptor, run the network, synchronise, and core 0 then prints the
 * first 640 bytes of the result.
 *
 * Returns 0 on every path.
 */
int main() {
    if (snrt_cluster_idx() != 0) {
        return 0;
    }

    if (snrt_cluster_core_idx() == 0) {
        printf("hello from snitch cluster! running mlperf tiny \r\n");
    }

    // Create memref objects for data stored in L3
    TwoDMemrefI8_t memrefA;
    // NOTE(review): A comes from data.h, which is not visible here. If A is an
    // array, plain `A` would be the type-correct spelling -- confirm.
    memrefA.data = &A;
    memrefA.aligned_data = memrefA.data;
    // Shape and Stride need to be defined for dynamic case
    memrefA.shape[0] = 8;
    memrefA.shape[1] = 640;
    memrefA.stride[0] = 640;
    memrefA.stride[1] = 1;
    memrefA.offset = 0;

    // Output descriptor; zeroed so that it never holds indeterminate values
    // when it is read back below.
    TwoDMemrefI8_t memrefB = {0};

    // The cycle counter is read before and after the run; the values are
    // discarded here.
    (void)snrt_mcycle();

    _mlir_ciface_run_network(&memrefB, &memrefA);

    snrt_cluster_hw_barrier();

    (void)snrt_mcycle();

    if (snrt_cluster_core_idx() == 0) {
        // %p requires a void * argument.
        printf("Got result at %p: \r\n", (void *)memrefB.aligned_data);
        const int8_t *data = memrefB.aligned_data;
        for (int i = 0; i < 640; i++) {
            printf("%d ", data[i]);
        }
        printf("\r\n");
    }

    snrt_cluster_hw_barrier();

    return 0;
}
48 changes: 48 additions & 0 deletions target/sim/sw/device/snax-mlir/runtime/include/memref.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#pragma once

#include <stdint.h>

/*
 * Descriptor for a one-dimensional buffer of int32_t elements.
 * The field order and widths are presumably fixed by the code that produces
 * and consumes these descriptors -- do not reorder (TODO confirm).
 */
struct OneDMemrefI32 {
    int32_t *data;  // allocated pointer: Pointer to data buffer as allocated,
                    // only used for deallocating the memref
    int32_t *aligned_data;  // aligned pointer: Pointer to properly aligned data
                            // that memref indexes
    uint32_t offset;     // offset applied on top of aligned_data
    uint32_t shape[1];   // extent of the single dimension
    uint32_t stride[1];  // stride of the single dimension
};

/*
 * Descriptor for a one-dimensional buffer of int64_t elements.
 * The field order and widths are presumably fixed by the code that produces
 * and consumes these descriptors -- do not reorder (TODO confirm).
 */
struct OneDMemrefI64 {
    int64_t *data;  // allocated pointer: Pointer to data buffer as allocated,
                    // only used for deallocating the memref
    int64_t *aligned_data;  // aligned pointer: Pointer to properly aligned data
                            // that memref indexes
    uint32_t offset;     // offset applied on top of aligned_data
    uint32_t shape[1];   // extent of the single dimension
    uint32_t stride[1];  // stride of the single dimension
};

/*
 * Descriptor for a two-dimensional buffer of int32_t elements.
 * The field order and widths are presumably fixed by the code that produces
 * and consumes these descriptors -- do not reorder (TODO confirm).
 */
struct TwoDMemrefI32 {
    int32_t *data;  // allocated pointer: Pointer to data buffer as allocated,
                    // only used for deallocating the memref
    int32_t *aligned_data;  // aligned pointer: Pointer to properly aligned data
                            // that memref indexes
    uint32_t offset;     // offset applied on top of aligned_data
    uint32_t shape[2];   // extent of each of the two dimensions
    uint32_t stride[2];  // stride of each of the two dimensions
};

/*
 * Descriptor for a two-dimensional buffer of int8_t elements.
 * The field order and widths are presumably fixed by the code that produces
 * and consumes these descriptors -- do not reorder (TODO confirm).
 */
struct TwoDMemrefI8 {
    int8_t *data;  // allocated pointer: Pointer to data buffer as allocated,
                   // only used for deallocating the memref
    int8_t *aligned_data;  // aligned pointer: Pointer to properly aligned data
                           // that memref indexes
    uint32_t offset;     // offset applied on top of aligned_data
    uint32_t shape[2];   // extent of each of the two dimensions
    uint32_t stride[2];  // stride of each dimension; presumably counted in
                         // elements -- TODO confirm
};

// Aliases that let the descriptors be used without the `struct` keyword.
typedef struct OneDMemrefI32 OneDMemrefI32_t;
typedef struct OneDMemrefI64 OneDMemrefI64_t;
typedef struct TwoDMemrefI8 TwoDMemrefI8_t;
typedef struct TwoDMemrefI32 TwoDMemrefI32_t;
97 changes: 97 additions & 0 deletions target/sim/sw/device/snax-mlir/runtime/include/snax_rt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#pragma once

#include <snrt.h>
#include <stdint.h>

/* Hand-over slot: written by the DM core, read by every core after the
 * barrier in _mlir_memref_to_llvm_alloc(). */
int8_t *allocated_pointer;

/*
 * Allocates `size` bytes with snrt_l1alloc() and returns the same pointer on
 * every calling core.
 *
 * This calls malloc on the DMA core
 * --> requires mlir opt to compile with:
 *  --convert-memref-to-llvm="use-generic-functions index-bitwidth=32"
 * Only the DM core performs the allocation and publishes the result in
 * `allocated_pointer`; after the barrier every core returns that value, so
 * all cores in the cluster come up with the same pointer.
 *
 * size: number of bytes to allocate.
 * Returns the pointer published by the DM core.
 */
int8_t *_mlir_memref_to_llvm_alloc(uint32_t size) {
    if (snrt_is_dm_core()) {
        allocated_pointer = (int8_t *)snrt_l1alloc(size);
    }
    snrt_cluster_hw_barrier();
    return allocated_pointer;
}

/* Result of an aligned allocation: the raw block and the aligned address
 * inside it. */
typedef struct alloc_result {
    void *pointer;          // pointer as returned by the allocator
    void *aligned_pointer;  // `pointer` advanced to the requested alignment
} alloc_result_t;

// The result record lives at a fixed address rather than in a regular
// variable. NOTE(review): 0x10000000 is presumably the start of the
// cluster-local memory, visible to all cores -- confirm against the memory map.
alloc_result_t *allocated_result = (alloc_result_t *)0x10000000;

/*
 * Allocates `size` bytes with snrt_l1alloc(), aligned to `alignment` bytes,
 * and publishes the result in the shared `allocated_result` record.
 *
 * Only the DM core allocates; the other cores read the record after the
 * barrier. Cores 0..2 then print the record, one per barrier, so each calling
 * core executes four barriers in total.
 *
 * size:      number of usable bytes requested.
 * alignment: required alignment in bytes; 0 is treated as "no alignment
 *            requirement" instead of triggering a modulo by zero.
 *
 * Returns `allocated_result`, whose `pointer` is the raw allocation and whose
 * `aligned_pointer` is the aligned address inside it.
 */
alloc_result_t *_mlir_ciface_snax_alloc_l1(uint32_t size, uint32_t alignment) {
    if (snrt_is_dm_core()) {
        // The padding is derived from the address the next allocation will
        // start at.
        uintptr_t next_addr = (uintptr_t)snrt_l1_next();

        // Padding needed to reach the next aligned address. It is zero when
        // the address is already aligned, so no memory is wasted.
        uint32_t extra_size = 0;
        if (alignment != 0) {
            uint32_t misalignment = (uint32_t)(next_addr % alignment);
            if (misalignment != 0) {
                extra_size = alignment - misalignment;
            }
        }

        void *raw_pointer = snrt_l1alloc(size + extra_size);
        void *aligned_pointer = (void *)((uintptr_t)raw_pointer + extra_size);

        allocated_result->pointer = raw_pointer;
        allocated_result->aligned_pointer = aligned_pointer;
    }

    snrt_cluster_hw_barrier();

    for (int i = 0; i < 3; i++) {
        if (snrt_cluster_core_idx() == i) {
            // %p requires a void * argument.
            printf("Allocated result at %p is %p\r\n", (void *)allocated_result,
                   allocated_result->aligned_pointer);
        }
        snrt_cluster_hw_barrier();
    }

    return allocated_result;
}

/*
 * Resets the allocator and overwrites a region of memory by DMA.
 *
 * Every calling core re-initialises the allocator and reserves 256 bytes.
 * The DM core then replicates one 0x40-byte chunk (source stride 0) 0x1800
 * times at a destination stride of 0x40, and all cores meet at a barrier.
 */
void _mlir_ciface_snax_dump_l1() {
    snrt_alloc_init();

    // keep first 256 bytes free for zero wizardry
    snrt_l1alloc(256);

    // memset all to 0 with DMA
    if (snrt_is_dm_core()) {
        int32_t *const fill_dst = (int32_t *)0x10000080;
        int32_t *const fill_src = (int32_t *)0x10000040;

        snrt_dma_start_2d(fill_dst, fill_src, 0x40, 0x40, 0, 0x1800);
        snrt_dma_wait_all();
    }

    snrt_cluster_hw_barrier();
}

/* Thin wrapper that forwards to snrt_cluster_hw_barrier(). */
void _mlir_ciface_snax_cluster_hw_barrier() { snrt_cluster_hw_barrier(); }

/*
 * Starts a 1D DMA transfer and waits until all DMA transfers have finished.
 *
 * source:      address to copy from.
 * destination: address to copy to.
 * size:        forwarded unchanged as the transfer size.
 */
void _mlir_ciface_snax_dma_1d_transfer(size_t *source, size_t *destination,
                                       size_t size) {
    void *dst = (void *)destination;
    void *src = (void *)source;

    // The runtime call takes the destination first.
    snrt_dma_start_1d(dst, src, size);
    snrt_dma_wait_all();
}

/*
 * Starts a 2D DMA transfer and waits until all DMA transfers have finished.
 *
 * source:      address to copy from.
 * destination: address to copy to.
 * size:        forwarded unchanged as the per-repetition transfer size.
 * src_stride:  stride applied on the source side.
 * dst_stride:  stride applied on the destination side.
 * repeat:      forwarded unchanged as the repetition count.
 */
void _mlir_ciface_snax_dma_2d_transfer(size_t *source, size_t *destination,
                                       size_t size, size_t src_stride,
                                       size_t dst_stride, size_t repeat) {
    void *dst = (void *)destination;
    void *src = (void *)source;

    // The runtime call takes destination before source, for both the
    // pointers and the strides.
    snrt_dma_start_2d(dst, src, size, dst_stride, src_stride, repeat);
    snrt_dma_wait_all();
}

/* Returns the value of snrt_is_dm_core() for the calling core. */
int _mlir_ciface_snax_is_dm_core() {
    return snrt_is_dm_core();
}

/* Returns 1 when the calling core has cluster-local index 0, otherwise 0. */
int _mlir_ciface_snax_is_compute_core() {
    int core_idx = snrt_cluster_core_idx();
    return core_idx == 0;
}
3 changes: 2 additions & 1 deletion target/sim/sw/host/apps/offload/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ DEVICE_APPS += snax/snax-data-reshuffler
DEVICE_APPS += snax/snax-streamer-gemm-conv
DEVICE_APPS += snax/snax-streamer-gemm-conv-simd
DEVICE_APPS += snax/snax-test-integration
DEVICE_APPS += snax/snax-mlperf-tiny-ad01
# Dependencies
INCDIRS += $(RUNTIME_DIR)
INCDIRS += $(HOST_DIR)/../shared/platform/generated
Expand Down Expand Up @@ -145,4 +146,4 @@ $(BUILDDIR)/$(APP)-%.dwarf: $(BUILDDIR)/$(APP)-%.elf | $(BUILDDIR)

ifneq ($(MAKECMDGOALS),clean)
-include $(DEP)
endif
endif
Loading