From 1712415f095a3ab0735db806fd3aad70028ba66d Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Fri, 5 Jul 2024 17:44:31 +0200 Subject: [PATCH] [cheshire] Back-ref sw compilation flow for fmatmul --- cheshire/README.md | 10 +- cheshire/sw/Makefile | 32 ++++-- cheshire/sw/README.md | 6 +- cheshire/sw/encoding.h | 1 - cheshire/sw/{ => include}/cheshire_util.h | 4 +- cheshire/sw/include/encoding.h | 1 + cheshire/sw/include/fmatmul.c.h | 1 + cheshire/sw/include/fmatmul.h | 1 + cheshire/sw/{ => include}/vector_util.h | 22 ++++ cheshire/sw/src/fmatmul.c | 126 ++++++++++++++++++++++ cheshire/sw/{ => src}/vector_helloworld.c | 0 11 files changed, 184 insertions(+), 20 deletions(-) delete mode 120000 cheshire/sw/encoding.h rename cheshire/sw/{ => include}/cheshire_util.h (90%) create mode 120000 cheshire/sw/include/encoding.h create mode 120000 cheshire/sw/include/fmatmul.c.h create mode 120000 cheshire/sw/include/fmatmul.h rename cheshire/sw/{ => include}/vector_util.h (54%) create mode 100644 cheshire/sw/src/fmatmul.c rename cheshire/sw/{ => src}/vector_helloworld.c (100%) diff --git a/cheshire/README.md b/cheshire/README.md index d59cae828..5ea78bc44 100644 --- a/cheshire/README.md +++ b/cheshire/README.md @@ -8,7 +8,7 @@ Support for FPGA synthesis was added to Ara by integrating it into Cheshire. Sin 1. **Navigate to the Root Directory** Ensure you are in the root directory where the Makefile is located. - + 2. **Set up environment** Set the `BACKREF_CHS_ROOT` variable to root directory of the Cheshire repository where you want to build the bitstream. @@ -26,17 +26,17 @@ This command will: Here's how we use back-referencing in our setup: 1. **Generate Custom TCL File**: - + - We generate a custom `add_sources.vcu128.tcl` file using the `bender script vivado` command with our specific targets (`-t fpga -t cv64a6_imafdcv_sv39 -t cva6 -t vcu128 --define ARA`). 
- This custom TCL file includes all the necessary sources and configurations required for the FPGA synthesis with Cheshire + Ara. 2. **Copy Custom TCL File**: - + - The generated custom TCL file is then copied into the Cheshire directory (`$(BACKREF_CHS_XIL_SCRIPTS)/add_sources.vcu128.tcl`). 3. **Invoke Cheshire Compile Flow**: - + - With the custom TCL file in place, we invoke the Cheshire compile flow by running `make -C $(BACKREF_CHS_ROOT) chs-xilinx-all`. - The Cheshire compile flow target depends on the `add_sources.vcu128.tcl` file, and since we have provided our custom version, it will use ours for the synthesis process. -This method ensures that we can extend and customize the compile flow for our specific needs without modifying the Cheshire repository directly. +This method ensures that we can extend and customize the compile flow for our specific needs without modifying the Cheshire repository directly. \ No newline at end of file diff --git a/cheshire/sw/Makefile b/cheshire/sw/Makefile index 56bd291cc..0194ece01 100644 --- a/cheshire/sw/Makefile +++ b/cheshire/sw/Makefile @@ -6,22 +6,34 @@ # # Copy and compile vector software on Cheshire -CHS_ROOT ?= $(realpath ../../../../../..) -ARA_SW := $(dir $(realpath $(firstword $(MAKEFILE_LIST)))) +CHS_ROOT ?= $(dir $(realpath $(firstword $(MAKEFILE_LIST))))/../../../../../.. +ARA_ROOT := $(dir $(realpath $(firstword $(MAKEFILE_LIST))))/../.. 
CHS_SW := $(CHS_ROOT)/sw -SRC := $(wildcard $(ARA_SW)/*.c) $(wildcard $(ARA_SW)/*.h) +ARA_SW := $(ARA_ROOT)/cheshire/sw +ARA_APPS := $(ARA_ROOT)/apps + +APPS := $(patsubst $(ARA_APPS)/%/main.c,%,$(shell find $(ARA_APPS) -name "main.c")) +SW_C := $(wildcard $(ARA_SW)/src/*.c) +DEPS_H := $(wildcard $(ARA_SW)/include/*.h) + +ARA_CONFIGURATION ?= 2_lanes +include $(ARA_ROOT)/config/$(ARA_CONFIGURATION).mk # Get the original compiler options and add the support for vector extension CHS_SW_FLAGS ?= $(shell grep "^CHS_SW_FLAGS\s\+?=\s\+" -- $(CHS_SW)/sw.mk | sed 's/^.*?= //' | sed s/rv64gc/rv64gcv/) +# Tweak the compilation to include Cheshire-related headers and files +CHS_SW_FLAGS += -DCHESHIRE -DNR_LANES=$(nr_lanes) -DVLEN=$(vlen) -.PHONY: chs-sw-all copy_vector_sw +.PHONY: chs-sw-all copy_vector_sw copy-vector-deps # Forward build command to the main Cheshire makefile and attach the correct -march -# Rename the .c vector files not to break the cheshire vanilla flow -chs-sw-all: copy-vector-sw +chs-sw-all: copy-vector-sw copy-vector-deps make -C $(CHS_ROOT) $@ CHS_SW_FLAGS="$(CHS_SW_FLAGS)" - for f in $(filter %.c, $(SRC)); do mv $(CHS_SW)/tests/$f $(CHS_SW)/tests/$f.bkp; done -# Copy the vector programs to cheshire -copy-vector-sw: - cp $(SRC) $(CHS_SW)/tests +# Copy the dependencies from this folder to Cheshire +copy-vector-deps: $(DEPS_H) + cp $^ $(CHS_SW)/tests + +# Copy the vector programs from the src folder to cheshire +copy-vector-sw: $(SW_C) + cp $^ $(CHS_SW)/tests diff --git a/cheshire/sw/README.md b/cheshire/sw/README.md index e4be744d2..3eb04fb86 100644 --- a/cheshire/sw/README.md +++ b/cheshire/sw/README.md @@ -1,9 +1,11 @@ # Build software for Cheshire Ara -Compile the `.c` programs in this folder with: +## Compile the vector code for Cheshire + +Compile the source files with the vector extension support enable: ```bash make chs-sw-all ``` -This command will copy the necessary source files into Cheshire's `sw/tests` directory and compile them with the 
support for vector extension.
\ No newline at end of file
+This command will also copy the necessary dependencies to `sw/tests` and enable the vector extension at compile time.
diff --git a/cheshire/sw/encoding.h b/cheshire/sw/encoding.h
deleted file mode 120000
index d2d456631..000000000
--- a/cheshire/sw/encoding.h
+++ /dev/null
@@ -1 +0,0 @@
-../../apps/common/encoding.h
\ No newline at end of file
diff --git a/cheshire/sw/cheshire_util.h b/cheshire/sw/include/cheshire_util.h
similarity index 90%
rename from cheshire/sw/cheshire_util.h
rename to cheshire/sw/include/cheshire_util.h
index ca1bd5b29..9d57d7cc4 100644
--- a/cheshire/sw/cheshire_util.h
+++ b/cheshire/sw/include/cheshire_util.h
@@ -11,14 +11,14 @@
 
 #include "printf.h"
 
-inline void cheshire_start() {
+static inline void cheshire_start(void) { // static: this header holds the definition
   // Initialize Cheshire's UART
   uint32_t rtc_freq = *reg32(&__base_regs, CHESHIRE_RTC_FREQ_REG_OFFSET);
   uint64_t reset_freq = clint_get_core_freq(rtc_freq, 2500);
   uart_init(&__base_uart, reset_freq, __BOOT_BAUDRATE);
 }
 
-inline void cheshire_finish() {
+static inline void cheshire_end(void) { // static inline, like cheshire_start
   // Flush teh UART
   uart_write_flush(&__base_uart);
 }
diff --git a/cheshire/sw/include/encoding.h b/cheshire/sw/include/encoding.h
new file mode 120000
index 000000000..674da338d
--- /dev/null
+++ b/cheshire/sw/include/encoding.h
@@ -0,0 +1 @@
+../../../apps/common/encoding.h
\ No newline at end of file
diff --git a/cheshire/sw/include/fmatmul.c.h b/cheshire/sw/include/fmatmul.c.h
new file mode 120000
index 000000000..1aa8fb602
--- /dev/null
+++ b/cheshire/sw/include/fmatmul.c.h
@@ -0,0 +1 @@
+../../../apps/fmatmul/kernel/fmatmul.c
\ No newline at end of file
diff --git a/cheshire/sw/include/fmatmul.h b/cheshire/sw/include/fmatmul.h
new file mode 120000
index 000000000..928a355e6
--- /dev/null
+++ b/cheshire/sw/include/fmatmul.h
@@ -0,0 +1 @@
+../../../apps/fmatmul/kernel/fmatmul.h
\ No newline at end of file
diff --git a/cheshire/sw/vector_util.h b/cheshire/sw/include/vector_util.h
similarity index 54%
rename from
cheshire/sw/vector_util.h
rename to cheshire/sw/include/vector_util.h
index 9526ffb66..77e032ef6 100644
--- a/cheshire/sw/vector_util.h
+++ b/cheshire/sw/include/vector_util.h
@@ -13,9 +13,31 @@
 #include
 #include "encoding.h"
 
+#define start_timer() // Timer stubs: start/stop expand to nothing,
+#define stop_timer()  // and get_timer() always evaluates to 0
+#define get_timer() 0
+
+#define FABS(x) (((x) < 0) ? -(x) : (x)) // NB: evaluates x twice
+
 inline void enable_rvv() {
   asm volatile ("li t0, %0" :: "i"(MSTATUS_VS));
   asm volatile ("csrs mstatus, t0" );
 }
 
+static inline int similarity_check(double a, double b, double threshold) {
+  double diff = a - b; // Returns 0 if |a - b| > threshold, 1 otherwise
+  if (FABS(diff) > threshold)
+    return 0;
+  else
+    return 1;
+}
+
+static inline int similarity_check_32b(float a, float b, float threshold) {
+  float diff = a - b; // Single-precision twin of similarity_check()
+  if (FABS(diff) > threshold)
+    return 0;
+  else
+    return 1;
+}
+
 #endif
diff --git a/cheshire/sw/src/fmatmul.c b/cheshire/sw/src/fmatmul.c
new file mode 100644
index 000000000..54ec72c02
--- /dev/null
+++ b/cheshire/sw/src/fmatmul.c
@@ -0,0 +1,126 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Matteo Perotti
+//
+// fmatmul wrapper for Cheshire
+
+#include "regs/cheshire.h"
+#include "dif/clint.h"
+#include "dif/uart.h"
+#include "params.h"
+#include "util.h"
+
+#include "cheshire_util.h"
+#include "vector_util.h"
+
+#include "fmatmul.c.h"
+
+#ifndef _MM_SIZE_
+#define _MM_SIZE_ 32
+#endif
+
+// Matrix dimensions: C = AB with A=[MxN], B=[NxP], C=[MxP]
+uint64_t M = _MM_SIZE_;
+uint64_t N = _MM_SIZE_;
+uint64_t P = _MM_SIZE_;
+
+// Operands and result, statically sized for _MM_SIZE_ x _MM_SIZE_ elements
+double a[_MM_SIZE_*_MM_SIZE_] __attribute__((aligned(32 * NR_LANES)));
+double b[_MM_SIZE_*_MM_SIZE_] __attribute__((aligned(32 * NR_LANES)));
+double c[_MM_SIZE_*_MM_SIZE_] __attribute__((aligned(32 * NR_LANES)));
+// Gold results
+double g[_MM_SIZE_*_MM_SIZE_] __attribute__((aligned(32 * NR_LANES)));
+
+#define THRESHOLD 0.001
+
+// Compare result against gold (both R x C, row-major) element by element.
+// Returns 0 on a match, -1 if element 0 differs, else the first bad index.
+int verify_matrix(double *result, double *gold, size_t R, size_t C,
+                  double threshold) {
+  for (uint64_t i = 0; i < R; ++i) {
+    for (uint64_t j = 0; j < C; ++j) {
+      int idx = i * C + j;
+      if (!similarity_check(result[idx], gold[idx], threshold)) {
+        return (i + j) == 0 ? -1 : idx;
+      }
+    }
+  }
+  return 0;
+}
+
+int main(void) {
+  cheshire_start(); // The UART must be up before the first printf
+  enable_rvv();
+
+  printf("\n");
+  printf("=============\n");
+  printf("= FMATMUL =\n");
+  printf("=============\n");
+  printf("\n");
+  printf("------------------------------------------------------------\n");
+  printf("Calculating a (%lu x %lu) x (%lu x %lu) matrix multiplication...\n",
+         M, N, N, P);
+  printf("------------------------------------------------------------\n");
+  printf("\n");
+
+  unsigned int s = M;
+
+  // Initialize matrices
+  for (unsigned int i = 0; i < s; ++i) {
+    for (unsigned int k = 0; k < s; ++k) {
+      a[k + i * s] = (double)(i + k);
+    }
+  }
+  for (unsigned int k = 0; k < s; ++k) {
+    for (unsigned int j = 0; j < s; ++j) {
+      b[j + k * s] = (double)k - (double)j; // unsigned k - j would wrap
+    }
+  }
+
+  // Run scalar check
+  printf("Calculating fmatmul on scalar core...\n");
+  for (unsigned int i = 0; i < s; ++i) {
+    for (unsigned int j = 0; j < s; ++j) {
+      double sum = 0;
+      for (unsigned int k = 0; k < s; ++k) {
+        sum += a[k + i * s] * b[j + k * s];
+      }
+      // Row-major, as verify_matrix expects (assumes fmatmul() writes c so too)
+      g[j + i * s] = sum;
+    }
+  }
+
+  // Run vector kernel
+  printf("Calculating fmatmul on vector core...\n");
+  start_timer();
+  fmatmul(c, a, b, s, s, s);
+  stop_timer();
+
+  // Metrics (get_timer() may be a stub returning 0: never divide by it)
+  int64_t runtime = get_timer();
+  float performance = runtime > 0 ? 2.0 * s * s * s / runtime : 0;
+  float utilization = 100 * performance / (2.0 * NR_LANES);
+
+  printf("The execution took %ld cycles.\n", (long)runtime);
+  printf("The performance is %f FLOP/cycle (%f%% utilization).\n",
+         performance, utilization);
+
+  // Verify the result only for s == M (to keep it simple)
+  int error = 0;
+  if (s == M) {
+    printf("Verifying result...\n");
+    error = verify_matrix(c, g, s, s, THRESHOLD);
+    if (error != 0) {
+      int idx = error < 0 ? 0 : error; // -1 encodes a mismatch at index 0
+      printf("Error code %d\n", error);
+      printf("c[%d]=%f\n", idx, c[idx]);
+    } else {
+      printf("Passed.\n");
+    }
+  }
+
+  cheshire_end(); // Flush the UART on both the pass and the fail path
+  return error;
+}
diff --git a/cheshire/sw/vector_helloworld.c b/cheshire/sw/src/vector_helloworld.c
similarity index 100%
rename from
cheshire/sw/vector_helloworld.c rename to cheshire/sw/src/vector_helloworld.c