From 3c73ab9ed1f3518ca50902e06ac07579c6288069 Mon Sep 17 00:00:00 2001 From: Luca Bertaccini <55843305+lucabertaccini@users.noreply.github.com> Date: Wed, 10 Jan 2024 15:00:40 +0100 Subject: [PATCH] hw: Bump FPU, update FP latencies, fix low-precision FP GEMM (#66) * Remove IF statement preventing from executing low-precision SIMD GEMMs * Align FP latencies with Occamy configuration * Fix disassembly generation for Snitch custom instructions * Fix gemm.c transpose check * gemm: Correct BIST check * TODO: squash with 2ccb6a77287be01be5eb275bd39e041f1cfc9aa2 * Change threshold to tolerate precision losses in large low-precision GEMMs * Bump FPU to pulp-v0.1.3 * Check relative errors in GEMMs * Fix bug in GEMM --------- Co-authored-by: Luca Colagrande --- Bender.yml | 2 +- hw/snitch_cluster/src/snitch_cc.sv | 1 + hw/snitch_cluster/src/snitch_fp_ss.sv | 2 ++ hw/snitch_cluster/src/snitch_fpu.sv | 11 ++++++++--- sw/blas/gemm/src/gemm.h | 6 +++--- sw/blas/gemm/src/main.c | 20 +++++++++++--------- target/snitch_cluster/cfg/default.hjson | 4 ++-- target/snitch_cluster/sw/apps/common.mk | 2 +- target/snitch_cluster/sw/toolchain.mk | 4 ++++ 9 files changed, 33 insertions(+), 19 deletions(-) diff --git a/Bender.yml b/Bender.yml index 4ae9d28e9..84dad47e8 100644 --- a/Bender.yml +++ b/Bender.yml @@ -22,7 +22,7 @@ dependencies: axi: { git: https://github.com/pulp-platform/axi, version: 0.39.0 } axi_riscv_atomics: { git: https://github.com/pulp-platform/axi_riscv_atomics, version: 0.6.0 } common_cells: { git: https://github.com/pulp-platform/common_cells, version: 1.28.0 } - FPnew: { git: https://github.com/openhwgroup/cvfpu, rev: 1202ca3 } # TODO: feature branch `feature/expanding_sdotp`; get merged! + FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 } register_interface: { git: https://github.com/pulp-platform/register_interface, version: 0.4.2 } tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.11 } riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 } diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 2d38c63b3..5bb3b1b48 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -487,6 +487,7 @@ module snitch_cc #( .trace_port_o ( fpu_trace ), .sequencer_tracer_port_o ( fpu_sequencer_trace ), // pragma translate_on + .hart_id_i ( hart_id_i ), .acc_req_i ( acc_snitch_req ), .acc_req_valid_i ( acc_qvalid ), .acc_req_ready_o ( acc_qready ), diff --git a/hw/snitch_cluster/src/snitch_fp_ss.sv b/hw/snitch_cluster/src/snitch_fp_ss.sv index 0b994e19e..fc75a386c 100644 --- a/hw/snitch_cluster/src/snitch_fp_ss.sv +++ b/hw/snitch_cluster/src/snitch_fp_ss.sv @@ -42,6 +42,7 @@ module snitch_fp_ss import snitch_pkg::*; #( output fpu_trace_port_t trace_port_o, output fpu_sequencer_trace_port_t sequencer_tracer_port_o, // pragma translate_on + input logic [31:0] hart_id_i, // Accelerator Interface - Slave input acc_req_t acc_req_i, input logic acc_req_valid_i, @@ -2509,6 +2510,7 @@ module snitch_fp_ss import snitch_pkg::*; #( ) i_fpu ( .clk_i , .rst_ni ( ~rst_i ), + .hart_id_i ( hart_id_i ), .operands_i ( op ), .rnd_mode_i ( fpu_rnd_mode ), .op_i ( fpu_op ), diff --git a/hw/snitch_cluster/src/snitch_fpu.sv b/hw/snitch_cluster/src/snitch_fpu.sv index 44d28df45..ed7958edc 100644 --- a/hw/snitch_cluster/src/snitch_fpu.sv +++ b/hw/snitch_cluster/src/snitch_fpu.sv @@ -19,6 +19,7 @@ module snitch_fpu import snitch_pkg::*; #( input logic clk_i, input logic rst_ni, // Input signals + input logic [31:0] hart_id_i, input logic [2:0][FLEN-1:0] operands_i, input fpnew_pkg::roundmode_e rnd_mode_i, input fpnew_pkg::operation_e op_i, @@ -99,12 +100,15 @@ module snitch_fpu import snitch_pkg::*; #( fpnew_top #( // FPU configuration - .Features ( FPUFeatures ), - .Implementation ( FPUImplementation ), - .TagType ( logic[6:0] ) + .Features ( FPUFeatures ), + .Implementation ( FPUImplementation ), + .TagType ( logic[6:0] ), + .CompressedVecCmpResult ( 1 ), + .StochasticRndImplementation ( fpnew_pkg::DEFAULT_RSR ) ) i_fpu ( .clk_i , .rst_ni , + .hart_id_i ( hart_id_i ), .operands_i ( fpu_in_q.operands ), .rnd_mode_i ( fpu_in_q.rnd_mode ), .op_i ( fpu_in_q.op ), @@ -114,6 +118,7 @@ module snitch_fpu import snitch_pkg::*; #( .int_fmt_i ( fpu_in_q.int_fmt ), .vectorial_op_i ( fpu_in_q.vectorial_op ), .tag_i ( fpu_in_q.tag ), + .simd_mask_i ( '1 ), .in_valid_i ( in_valid_q ), .in_ready_o ( in_ready_q ), .flush_i ( 1'b0 ), diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h index e46a328a8..336b47b9a 100644 --- a/sw/blas/gemm/src/gemm.h +++ b/sw/blas/gemm/src/gemm.h @@ -929,9 +929,9 @@ void gemm(precision_t prec, uint32_t expand, uint32_t setup_ssr, } break; case FP8: - gemm_fp8_ex_opt(frac_m, n, k, (char*)a + offsetA, lda, (char*)b, - ldb, (char*)c + offsetC, ldc_strided, &beta, - setup_ssr); + gemm_fp8_ex_opt(frac_m, n, k, (char*)a + offsetA, lda_strided, + (char*)b, ldb, (char*)c + offsetC, ldc_strided, + &beta, setup_ssr); break; } } diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c index 33779abf7..e189f65d0 100644 --- a/sw/blas/gemm/src/main.c +++ b/sw/blas/gemm/src/main.c @@ -50,15 +50,15 @@ int main() { uint32_t start_cycle = snrt_mcycle(); volatile uint32_t lda = K; - volatile uint32_t ldb = N; + volatile uint32_t ldb = K; volatile uint32_t ldc = N; // Transpose of A unsopported if (TA) return -1; - if (TB) { + if (!TB) { // Transpose of B supported only in FP64 if (dtype_size != FP64) return -1; - ldb = K; + ldb = N; } gemm(dtype_size, expand, setup_ssr, TA, TB, frac_m, N, K, 1, local_a, @@ -75,6 +75,8 @@ int main() { snrt_dma_wait_all(); } + snrt_cluster_hw_barrier(); + // TODO: currently only works for single cluster otherwise need to // synchronize all cores here #ifdef BIST @@ -86,19 +88,19 @@ int main() { uint32_t idx = m * N + n; switch (dtype_size) { case FP64: - if (fabs(result[idx] - ((double *)local_c)[idx]) > - 0.001) + if (fabs(result[idx] - ((double *)local_c)[idx]) < + fabs(result[idx] * 0.00001)) errors--; break; case FP32: - if (fabs(result[idx] - ((float *)local_c)[idx]) > 0.001) + if (fabs(result[idx] - ((float *)local_c)[idx]) < + fabs(result[idx] * 0.0001)) errors--; break; case FP16: - if (fabs(result[idx] - ((__fp16 *)local_c)[idx]) > - 0.001) + if (fabs(result[idx] - ((__fp16 *)local_c)[idx]) < + fabs(result[idx] * 0.005)) errors--; - break; case FP8: printf("No golden model yet for fp8!\n"); return -1; diff --git a/target/snitch_cluster/cfg/default.hjson b/target/snitch_cluster/cfg/default.hjson index c39c2a490..7f28a1073 100644 --- a/target/snitch_cluster/cfg/default.hjson +++ b/target/snitch_cluster/cfg/default.hjson @@ -34,8 +34,8 @@ lat_comp_fp8: 1, lat_comp_fp8_alt: 1, lat_noncomp: 1, - lat_conv: 1, - lat_sdotp: 2, + lat_conv: 2, + lat_sdotp: 3, fpu_pipe_config: "BEFORE" narrow_xbar_latency: "CUT_ALL_PORTS", wide_xbar_latency: "CUT_ALL_PORTS", diff --git a/target/snitch_cluster/sw/apps/common.mk b/target/snitch_cluster/sw/apps/common.mk index 15538616e..e27a19cfd 100644 --- a/target/snitch_cluster/sw/apps/common.mk +++ b/target/snitch_cluster/sw/apps/common.mk @@ -82,7 +82,7 @@ $(ELF): $(SRCS) $(DEP) $(LIBS) | $(BUILDDIR) $(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@ $(DUMP): $(ELF) | $(BUILDDIR) - $(RISCV_OBJDUMP) -D $< > $@ + $(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) $< > $@ $(DWARF): $(ELF) | $(BUILDDIR) $(RISCV_DWARFDUMP) $< > $@ diff --git a/target/snitch_cluster/sw/toolchain.mk b/target/snitch_cluster/sw/toolchain.mk index ca5f6a482..3d50974b8 100644 --- a/target/snitch_cluster/sw/toolchain.mk +++ b/target/snitch_cluster/sw/toolchain.mk @@ -55,3 +55,7 @@ RISCV_LDFLAGS += -lclang_rt.builtins-riscv32 # Archiver flags RISCV_ARFLAGS = rcs + +# Objdump flags +RISCV_OBJDUMP_FLAGS += --mcpu=snitch +RISCV_OBJDUMP_FLAGS += -D