Skip to content

Commit

Permalink
hw: Bump FPU, update FP latencies, fix low-precision FP GEMM (#66)
Browse files Browse the repository at this point in the history
* Remove IF statement that prevented low-precision SIMD GEMMs from executing

* Align FP latencies with Occamy configuration

* Fix disassembly generation for Snitch custom instructions

* Fix gemm.c transpose check

* gemm: Correct BIST check

* TODO: squash with 2ccb6a7

* Change threshold to tolerate precision losses in large low-precision GEMMs

* Bump FPU to pulp-v0.1.3

* Check relative errors in GEMMs

* Fix bug in GEMM

---------

Co-authored-by: Luca Colagrande <[email protected]>
  • Loading branch information
lucabertaccini and colluca authored Jan 10, 2024
1 parent 0c226e2 commit 3c73ab9
Show file tree
Hide file tree
Showing 9 changed files with 33 additions and 19 deletions.
2 changes: 1 addition & 1 deletion Bender.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ dependencies:
axi: { git: https://github.com/pulp-platform/axi, version: 0.39.0 }
axi_riscv_atomics: { git: https://github.com/pulp-platform/axi_riscv_atomics, version: 0.6.0 }
common_cells: { git: https://github.com/pulp-platform/common_cells, version: 1.28.0 }
FPnew: { git: https://github.com/openhwgroup/cvfpu, rev: 1202ca3 } # TODO: feature branch `feature/expanding_sdotp`; get merged!
FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 }
register_interface: { git: https://github.com/pulp-platform/register_interface, version: 0.4.2 }
tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.11 }
riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 }
Expand Down
1 change: 1 addition & 0 deletions hw/snitch_cluster/src/snitch_cc.sv
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,7 @@ module snitch_cc #(
.trace_port_o ( fpu_trace ),
.sequencer_tracer_port_o ( fpu_sequencer_trace ),
// pragma translate_on
.hart_id_i ( hart_id_i ),
.acc_req_i ( acc_snitch_req ),
.acc_req_valid_i ( acc_qvalid ),
.acc_req_ready_o ( acc_qready ),
Expand Down
2 changes: 2 additions & 0 deletions hw/snitch_cluster/src/snitch_fp_ss.sv
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ module snitch_fp_ss import snitch_pkg::*; #(
output fpu_trace_port_t trace_port_o,
output fpu_sequencer_trace_port_t sequencer_tracer_port_o,
// pragma translate_on
input logic [31:0] hart_id_i,
// Accelerator Interface - Slave
input acc_req_t acc_req_i,
input logic acc_req_valid_i,
Expand Down Expand Up @@ -2509,6 +2510,7 @@ module snitch_fp_ss import snitch_pkg::*; #(
) i_fpu (
.clk_i ,
.rst_ni ( ~rst_i ),
.hart_id_i ( hart_id_i ),
.operands_i ( op ),
.rnd_mode_i ( fpu_rnd_mode ),
.op_i ( fpu_op ),
Expand Down
11 changes: 8 additions & 3 deletions hw/snitch_cluster/src/snitch_fpu.sv
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ module snitch_fpu import snitch_pkg::*; #(
input logic clk_i,
input logic rst_ni,
// Input signals
input logic [31:0] hart_id_i,
input logic [2:0][FLEN-1:0] operands_i,
input fpnew_pkg::roundmode_e rnd_mode_i,
input fpnew_pkg::operation_e op_i,
Expand Down Expand Up @@ -99,12 +100,15 @@ module snitch_fpu import snitch_pkg::*; #(

fpnew_top #(
// FPU configuration
.Features ( FPUFeatures ),
.Implementation ( FPUImplementation ),
.TagType ( logic[6:0] )
.Features ( FPUFeatures ),
.Implementation ( FPUImplementation ),
.TagType ( logic[6:0] ),
.CompressedVecCmpResult ( 1 ),
.StochasticRndImplementation ( fpnew_pkg::DEFAULT_RSR )
) i_fpu (
.clk_i ,
.rst_ni ,
.hart_id_i ( hart_id_i ),
.operands_i ( fpu_in_q.operands ),
.rnd_mode_i ( fpu_in_q.rnd_mode ),
.op_i ( fpu_in_q.op ),
Expand All @@ -114,6 +118,7 @@ module snitch_fpu import snitch_pkg::*; #(
.int_fmt_i ( fpu_in_q.int_fmt ),
.vectorial_op_i ( fpu_in_q.vectorial_op ),
.tag_i ( fpu_in_q.tag ),
.simd_mask_i ( '1 ),
.in_valid_i ( in_valid_q ),
.in_ready_o ( in_ready_q ),
.flush_i ( 1'b0 ),
Expand Down
6 changes: 3 additions & 3 deletions sw/blas/gemm/src/gemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -929,9 +929,9 @@ void gemm(precision_t prec, uint32_t expand, uint32_t setup_ssr,
}
break;
case FP8:
gemm_fp8_ex_opt(frac_m, n, k, (char*)a + offsetA, lda, (char*)b,
ldb, (char*)c + offsetC, ldc_strided, &beta,
setup_ssr);
gemm_fp8_ex_opt(frac_m, n, k, (char*)a + offsetA, lda_strided,
(char*)b, ldb, (char*)c + offsetC, ldc_strided,
&beta, setup_ssr);
break;
}
}
20 changes: 11 additions & 9 deletions sw/blas/gemm/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@ int main() {
uint32_t start_cycle = snrt_mcycle();

volatile uint32_t lda = K;
volatile uint32_t ldb = N;
volatile uint32_t ldb = K;
volatile uint32_t ldc = N;

// Transpose of A unsupported
if (TA) return -1;
if (TB) {
if (!TB) {
// Transpose of B supported only in FP64
if (dtype_size != FP64) return -1;
ldb = K;
ldb = N;
}

gemm(dtype_size, expand, setup_ssr, TA, TB, frac_m, N, K, 1, local_a,
Expand All @@ -75,6 +75,8 @@ int main() {
snrt_dma_wait_all();
}

snrt_cluster_hw_barrier();

// TODO: currently only works for single cluster otherwise need to
// synchronize all cores here
#ifdef BIST
Expand All @@ -86,19 +88,19 @@ int main() {
uint32_t idx = m * N + n;
switch (dtype_size) {
case FP64:
if (fabs(result[idx] - ((double *)local_c)[idx]) >
0.001)
if (fabs(result[idx] - ((double *)local_c)[idx]) <
fabs(result[idx] * 0.00001))
errors--;
break;
case FP32:
if (fabs(result[idx] - ((float *)local_c)[idx]) > 0.001)
if (fabs(result[idx] - ((float *)local_c)[idx]) <
fabs(result[idx] * 0.0001))
errors--;
break;
case FP16:
if (fabs(result[idx] - ((__fp16 *)local_c)[idx]) >
0.001)
if (fabs(result[idx] - ((__fp16 *)local_c)[idx]) <
fabs(result[idx] * 0.005))
errors--;
break;
case FP8:
printf("No golden model yet for fp8!\n");
return -1;
Expand Down
4 changes: 2 additions & 2 deletions target/snitch_cluster/cfg/default.hjson
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
lat_comp_fp8: 1,
lat_comp_fp8_alt: 1,
lat_noncomp: 1,
lat_conv: 1,
lat_sdotp: 2,
lat_conv: 2,
lat_sdotp: 3,
fpu_pipe_config: "BEFORE"
narrow_xbar_latency: "CUT_ALL_PORTS",
wide_xbar_latency: "CUT_ALL_PORTS",
Expand Down
2 changes: 1 addition & 1 deletion target/snitch_cluster/sw/apps/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ $(ELF): $(SRCS) $(DEP) $(LIBS) | $(BUILDDIR)
$(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@

$(DUMP): $(ELF) | $(BUILDDIR)
$(RISCV_OBJDUMP) -D $< > $@
$(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) $< > $@

$(DWARF): $(ELF) | $(BUILDDIR)
$(RISCV_DWARFDUMP) $< > $@
Expand Down
4 changes: 4 additions & 0 deletions target/snitch_cluster/sw/toolchain.mk
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,7 @@ RISCV_LDFLAGS += -lclang_rt.builtins-riscv32

# Archiver flags
RISCV_ARFLAGS = rcs

# Objdump flags
RISCV_OBJDUMP_FLAGS += --mcpu=snitch
RISCV_OBJDUMP_FLAGS += -D

0 comments on commit 3c73ab9

Please sign in to comment.