From 3470cd17579ce2c0b3160785fdf3b8bfe7a70a83 Mon Sep 17 00:00:00 2001 From: Tim Fischer Date: Thu, 25 Jul 2024 18:56:02 +0200 Subject: [PATCH] hw: Clean up shuffle unit --- Bender.yml | 1 + hw/snitch/src/riscv_instr.sv | 8 +- hw/snitch/src/snitch.sv | 39 +++-- hw/snitch_cluster/src/snitch_fp_ss.sv | 149 ++++++++----------- hw/snitch_cluster/src/snitch_shuffle_unit.sv | 112 ++++++++++++++ 5 files changed, 205 insertions(+), 104 deletions(-) create mode 100644 hw/snitch_cluster/src/snitch_shuffle_unit.sv diff --git a/Bender.yml b/Bender.yml index bfea9e6a45..a438211158 100644 --- a/Bender.yml +++ b/Bender.yml @@ -157,6 +157,7 @@ sources: - hw/snitch_cluster/src/snitch_cluster_peripheral/snitch_cluster_peripheral_reg_top.sv - hw/snitch_cluster/src/snitch_cluster_peripheral/snitch_cluster_peripheral.sv - hw/snitch_cluster/src/snitch_fpu.sv + - hw/snitch_cluster/src/snitch_shuffle_unit.sv - hw/snitch_cluster/src/snitch_sequencer.sv - hw/snitch_cluster/src/snitch_tcdm_interconnect.sv # Level 1 diff --git a/hw/snitch/src/riscv_instr.sv b/hw/snitch/src/riscv_instr.sv index 565aef730a..4ae45968d5 100644 --- a/hw/snitch/src/riscv_instr.sv +++ b/hw/snitch/src/riscv_instr.sv @@ -327,7 +327,6 @@ package riscv_instr; localparam logic [31:0] DMSTAT = 32'b0000101?????00000000?????0101011; localparam logic [31:0] DMSTR = 32'b0000110??????????000000000101011; localparam logic [31:0] DMREP = 32'b000011100000?????000000000101011; - localparam logic [31:0] DMMCAST = 32'b000100000000?????000000000101011; localparam logic [31:0] FREP_O = 32'b????????????????????????10001011; localparam logic [31:0] FREP_I = 32'b????????????????????????00001011; localparam logic [31:0] IREP = 32'b?????????????????????????0111111; @@ -841,8 +840,11 @@ package riscv_instr; localparam logic [31:0] VL4R_V = 32'b000011101000?????110?????0000111; localparam logic [31:0] VL8R_V = 32'b000111101000?????111?????0000111; localparam logic [31:0] VFSHUFFLE_S = 32'b1011111??????????000?????0110011; - localparam logic 
[31:0] VFSHUFFLE_H = 32'b1011111??????????010?????0110011; - localparam logic [31:0] VFSHUFFLE_B = 32'b1011111??????????011?????0110011; + localparam logic [31:0] VFSHUFFLE_H = 32'b1011111??????????001?????0110011; + localparam logic [31:0] VFSHUFFLE_B = 32'b1011111??????????010?????0110011; + localparam logic [31:0] VFSHUFFLE2_S = 32'b1011111??????????100?????0110011; + localparam logic [31:0] VFSHUFFLE2_H = 32'b1011111??????????101?????0110011; + localparam logic [31:0] VFSHUFFLE2_B = 32'b1011111??????????110?????0110011; localparam logic [31:0] IMV_X_W = 32'b111000000000?????000?????1011011; localparam logic [31:0] IMV_W_X = 32'b111100000000?????000?????1011011; localparam logic [31:0] IADDI = 32'b?????????????????000?????1111011; diff --git a/hw/snitch/src/snitch.sv b/hw/snitch/src/snitch.sv index 2bf34ad35f..936172ab5d 100644 --- a/hw/snitch/src/snitch.sv +++ b/hw/snitch/src/snitch.sv @@ -1601,18 +1601,6 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( illegal_inst = 1'b1; end end - VFSHUFFLE_S, - VFSHUFFLE_H, - VFSHUFFLE_B: begin - if (FP_EN && XFVEC && RVF) begin - opb_select = Reg; - write_rd = 1'b0; - acc_qvalid_o = valid_instr; - shuffle = 1'b1; - end else begin - illegal_inst = 1'b1; - end - end VFCVT_S_B, VFCVTU_S_B: begin if (FP_EN && XFVEC && RVF && FLEN >= 16) begin @@ -1721,6 +1709,33 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( illegal_inst = 1'b1; end end + VFSHUFFLE_S, + VFSHUFFLE2_S: begin + if (FP_EN && XFVEC && FLEN >= 64) begin + write_rd = 1'b0; + acc_qvalid_o = valid_instr; + end else begin + illegal_inst = 1'b1; + end + end + VFSHUFFLE_H, + VFSHUFFLE2_H: begin + if (FP_EN && XFVEC && FLEN >= 32) begin + write_rd = 1'b0; + acc_qvalid_o = valid_instr; + end else begin + illegal_inst = 1'b1; + end + end + VFSHUFFLE_B, + VFSHUFFLE2_B: begin + if (FP_EN && XFVEC && FLEN >= 16) begin + write_rd = 1'b0; + acc_qvalid_o = valid_instr; + end else begin + illegal_inst = 1'b1; + end + end // Offload FP-Int 
Instructions - fire and forget // Double Precision Floating-Point FLE_D, diff --git a/hw/snitch_cluster/src/snitch_fp_ss.sv b/hw/snitch_cluster/src/snitch_fp_ss.sv index b2a9fc7d8e..be2f214398 100644 --- a/hw/snitch_cluster/src/snitch_fp_ss.sv +++ b/hw/snitch_cluster/src/snitch_fp_ss.sv @@ -128,8 +128,9 @@ module snitch_fp_ss import snitch_pkg::*; #( // Shuffle Unit logic use_shfl; logic shfl_in_valid, shfl_in_ready; - logic shfl_valid; - logic shfl_in_ssr; + logic shfl_out_valid, shfl_out_ready; + logic [FLEN-1:0] shfl_result; + tag_t shfl_tag_in, shfl_tag_out; // FPU Controller logic fpu_out_valid, fpu_out_ready; @@ -246,7 +247,7 @@ module snitch_fp_ss import snitch_pkg::*; #( // 2. The LSU request can be handled // 3. The regfile operand is ready // 4. The Shuffle Unit and all operands are ready - assign shfl_in_valid = use_shfl & acc_req_valid_q & &(op_ready) & dst_ready; + assign shfl_in_valid = use_shfl & (&op_ready) & dst_ready; assign fpu_in_valid = use_fpu & acc_req_valid_q & (&op_ready) & dst_ready; // FPU ready assign acc_req_ready_q = dst_ready & ((fpu_in_ready & fpu_in_valid) @@ -259,7 +260,7 @@ module snitch_fp_ss import snitch_pkg::*; #( | (shfl_in_ready & shfl_in_valid)); // Shuffle Unit is ready to compute when Write Port is ready for shuffle result - assign shfl_in_ready = (!(acc_req_valid_q && result_select == ResAccBus) + assign shfl_out_ready = (!(acc_req_valid_q && result_select == ResAccBus) & !(fpu_out_valid && !fpu_tag_out.acc) & !lsu_pvalid); // either the FPU or the regfile produced a result @@ -329,13 +330,13 @@ module snitch_fp_ss import snitch_pkg::*; #( fpu_tag_in.rd = rd; fpu_tag_in.acc = 1'b0; // RD is on accelerator bus fpu_tag_in.ssr = ssr_active_q & is_rd_ssr; + shfl_tag_in = fpu_tag_in; is_store = 1'b0; is_load = 1'b0; ls_size = Word; use_shfl = 1'b0; - shfl_in_ssr = ssr_active_q & is_rd_ssr; // Destination register is in FPR rd_is_fp = 1'b1; @@ -548,15 +549,24 @@ module snitch_fp_ss import snitch_pkg::*; #( end 
riscv_instr::VFSHUFFLE_S: begin op_select[0] = RegA; - op_select[1] = AccBus; + op_select[1] = RegB; + src_fmt = fpnew_pkg::FP32; + dst_fmt = fpnew_pkg::FP32; + vectorial_op = 1'b1; + use_fpu = 1'b0; + use_shfl = 1'b1; + end + riscv_instr::VFSHUFFLE2_S: begin + op_select[0] = RegA; + op_select[1] = RegB; op_select[2] = RegDest; src_fmt = fpnew_pkg::FP32; dst_fmt = fpnew_pkg::FP32; vectorial_op = 1'b1; - set_dyn_rm = 1'b1; // fix round mode for vectors and fp16alt use_fpu = 1'b0; use_shfl = 1'b1; - end + op_mode = 1'b1; + end // Double Precision riscv_instr::FADD_D: begin fpu_op = fpnew_pkg::ADD; @@ -1127,14 +1137,23 @@ module snitch_fp_ss import snitch_pkg::*; #( end riscv_instr::VFSHUFFLE_H: begin op_select[0] = RegA; - op_select[1] = AccBus; + op_select[1] = RegB; + src_fmt = fpnew_pkg::FP16; + dst_fmt = fpnew_pkg::FP16; + vectorial_op = 1'b1; + use_fpu = 1'b0; + use_shfl = 1'b1; + end + riscv_instr::VFSHUFFLE2_H: begin + op_select[0] = RegA; + op_select[1] = RegB; op_select[2] = RegDest; src_fmt = fpnew_pkg::FP16; dst_fmt = fpnew_pkg::FP16; vectorial_op = 1'b1; - set_dyn_rm = 1'b1; // fix round mode for vectors and fp16alt use_fpu = 1'b0; use_shfl = 1'b1; + op_mode = 1'b1; end // [Alternate] Quarter Precision riscv_instr::FADD_B: begin @@ -1658,14 +1677,23 @@ module snitch_fp_ss import snitch_pkg::*; #( end riscv_instr::VFSHUFFLE_B: begin op_select[0] = RegA; - op_select[1] = AccBus; + op_select[1] = RegB; + src_fmt = fpnew_pkg::FP8; + dst_fmt = fpnew_pkg::FP8; + vectorial_op = 1'b1; + use_fpu = 1'b0; + use_shfl = 1'b1; + end + riscv_instr::VFSHUFFLE2_B: begin + op_select[0] = RegA; + op_select[1] = RegB; op_select[2] = RegDest; src_fmt = fpnew_pkg::FP8; dst_fmt = fpnew_pkg::FP8; vectorial_op = 1'b1; - set_dyn_rm = 1'b1; // fix round mode for vectors and fp16alt use_fpu = 1'b0; use_shfl = 1'b1; + op_mode = 1'b1; end // ------------------- // From float to int @@ -2483,81 +2511,24 @@ module snitch_fp_ss import snitch_pkg::*; #( // Shuffle Unit // 
---------------------- - logic [FLEN-1:0] shfl_result; - logic [7:0] vec_mask; - logic [7:0][2:0] element_mask; - logic [31:0] num_elements; - - logic [(FLEN/32)-1:0][31:0] rA_32, rD_32, rA_op_32, rD_op_32; - logic [(FLEN/16)-1:0][15:0] rA_16, rD_16, rA_op_16, rD_op_16; - logic [(FLEN/8)-1:0][7:0] rA_8, rD_8, rA_op_8, rD_op_8; - - always_comb begin - shfl_valid = 1'b0; - shfl_result = '0; - vec_mask = '0; - element_mask = '0; - rA_32 = '0; - rD_32 = '0; - rA_16 = '0; - rD_16 = '0; - rA_8 = '0; - rD_8 = '0; - - if (shfl_in_valid & shfl_in_ready) begin - - for (int i = 0; i < 8; i++) begin - vec_mask[i] = op[1][(i*4)+3]; - element_mask[i] = op[1][(i*4) +: 3]; - end - - unique case (src_fmt) - fpnew_pkg::FP32: begin - num_elements = FLEN/32; - - rA_op_32 = op[0]; - rD_op_32 = op[2]; - - for (int i = 0; i < (num_elements); i++) begin - - rA_32[i] = rA_op_32[element_mask[i]]; - rD_32[i] = rD_op_32[element_mask[i]]; - - shfl_result[(i*32) +: 32] = vec_mask[i] ? rA_32[i] : rD_32[i]; - end - end - fpnew_pkg::FP16: begin - num_elements = FLEN/16; - - rA_op_16 = op[0]; - rD_op_16 = op[2]; - - for (int i = 0; i < (num_elements); i++) begin - rA_16[i] = rA_op_16[element_mask[i]]; - rD_16[i] = rD_op_16[element_mask[i]]; - - shfl_result[(i*16) +: 16] = vec_mask[i] ? rA_16[i] : rD_16[i]; - end - end - fpnew_pkg::FP8: begin - num_elements = FLEN/8; - - rA_op_8 = op[0]; - rD_op_8 = op[2]; - - for (int i = 0; i < (num_elements); i++) begin - - rA_8[i] = rA_op_8[element_mask[i]]; - rD_8[i] = rD_op_8[element_mask[i]]; - - shfl_result[(i*8) +: 8] = vec_mask[i] ? 
rA_8[i] : rD_8[i]; - end - end - endcase - - shfl_valid = 1'b1; - end - end + snitch_shuffle_unit #( + .XFVEC(XFVEC), + .FLEN (FLEN) + ) i_snitch_shuffle_unit ( + .clk_i, + .rst_ni ( ~rst_i ), + .operands_i (op), + .op_mod_i (op_mode), + .src_fmt_i (src_fmt), + .dst_fmt_i (dst_fmt), + .tag_i (shfl_tag_in), + .in_valid_i (shfl_in_valid), + .in_ready_o (shfl_in_ready), + .result_o (shfl_result), + .tag_o (shfl_tag_out), + .out_valid_o(shfl_out_valid), + .out_ready_i(shfl_out_ready) + ); // ---------------------- // Operand Select @@ -2718,9 +2689,9 @@ module snitch_fp_ss import snitch_pkg::*; #( fpr_waddr = lsu_rd; fpr_wvalid = 1'b1; fpr_wready = 1'b0; - end else if (shfl_valid) begin + end else if (shfl_out_valid) begin fpr_we = 1'b1; - if (shfl_in_ssr) begin + if (shfl_tag_out.ssr) begin ssr_wvalid_o = 1'b1; // stall write-back to SSR if (!ssr_wready_i) begin diff --git a/hw/snitch_cluster/src/snitch_shuffle_unit.sv b/hw/snitch_cluster/src/snitch_shuffle_unit.sv new file mode 100644 index 0000000000..dcf6c4b18b --- /dev/null +++ b/hw/snitch_cluster/src/snitch_shuffle_unit.sv @@ -0,0 +1,112 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51
+//
+// Author: Lucia Luzi
+// Author: Tim Fischer
+//
+// Vector shuffle unit: every result element is a copy of one element of operands_i[0]
+// or operands_i[2]. operands_i[1] holds one 4-bit selector per result element: bits 2:0
+// are the source element index, bit 3 (only if op_mod_i is set) picks operands_i[2].
+// Purely combinational; valid/ready and the tag are passed straight through.
+module snitch_shuffle_unit #(
+  parameter bit          XFVEC = 0,  // enables the per-width datapaths (together with FLEN)
+  parameter int unsigned FLEN  = 0   // width of each operand and of the result, in bits
+) (
+  input  logic                  clk_i,        // not used inside this module
+  input  logic                  rst_ni,       // not used inside this module
+  // Input signals
+  input  logic [2:0][FLEN-1:0]  operands_i,   // [0], [2]: data vectors; [1]: element selectors
+  input  logic                  op_mod_i,     // 0: all elements come from operands_i[0]
+  input  fpnew_pkg::fp_format_e src_fmt_i,    // selects the element width of result_o
+  input  fpnew_pkg::fp_format_e dst_fmt_i,    // not used inside this module
+  input  logic [6:0]            tag_i,        // forwarded to tag_o unchanged
+  // Input Handshake
+  input  logic                  in_valid_i,
+  output logic                  in_ready_o,   // mirrors out_ready_i
+  // Output signals
+  output logic [FLEN-1:0]       result_o,     // all-zero unless in_valid_i and out_ready_i
+  output logic [6:0]            tag_o,
+  // Output handshake
+  output logic                  out_valid_o,  // mirrors in_valid_i
+  input  logic                  out_ready_i
+);
+
+  // ----------------------
+  // Mask
+  // ----------------------
+  // One selector nibble per result element. FLEN/8 is the element count of the 8-bit
+  // format; the wider formats only read the low entries of these arrays.
+  logic [FLEN/8-1:0]      vec_mask;
+  logic [FLEN/8-1:0][2:0] element_mask;
+
+  for (genvar i = 0; i < FLEN/8; i++) begin : gen_mask
+    assign vec_mask[i]     = operands_i[1][(i*4)+3];
+    assign element_mask[i] = operands_i[1][(i*4) +: 3];
+  end
+
+  // ----------------------
+  // Shuffle Unit
+  // ----------------------
+  logic [FLEN-1:0] result_f32, result_f16, result_f8;
+
+  // No internal state: the handshake and the tag pass straight through.
+  assign in_ready_o  = out_ready_i;
+  assign out_valid_o = in_valid_i;
+  assign tag_o       = tag_i;
+
+  // One datapath per element width; a datapath that is not generated drives zeros.
+  // NOTE(review): element_mask is 3 bits wide, but a vector of 32- or 16-bit elements
+  // can hold fewer than 8 of them (2 and 4 for FLEN = 64), so a selector can index
+  // past the last element. Confirm what such selectors are supposed to return.
+  if (XFVEC && FLEN >= 64) begin : gen_64
+    logic [FLEN/32-1:0][31:0] op1_vec_f32, op2_vec_f32, op1_vec_sel_f32, op2_vec_sel_f32;
+    assign op1_vec_f32 = operands_i[0];
+    assign op2_vec_f32 = operands_i[2];
+    for (genvar i = 0; i < FLEN/32; i++) begin : gen_vec_sel
+      assign op1_vec_sel_f32[i] = op1_vec_f32[element_mask[i]];
+      assign op2_vec_sel_f32[i] = op2_vec_f32[element_mask[i]];
+      assign result_f32[(i*32) +: 32] = (vec_mask[i] & op_mod_i) ? op2_vec_sel_f32[i] : op1_vec_sel_f32[i];
+    end
+  end else begin : gen_no_64
+    assign result_f32 = '0;
+  end
+  if (XFVEC && FLEN >= 32) begin : gen_32
+    logic [FLEN/16-1:0][15:0] op1_vec_f16, op2_vec_f16, op1_vec_sel_f16, op2_vec_sel_f16;
+    assign op1_vec_f16 = operands_i[0];
+    assign op2_vec_f16 = operands_i[2];
+    for (genvar i = 0; i < FLEN/16; i++) begin : gen_vec_sel
+      assign op1_vec_sel_f16[i] = op1_vec_f16[element_mask[i]];
+      assign op2_vec_sel_f16[i] = op2_vec_f16[element_mask[i]];
+      assign result_f16[(i*16) +: 16] = (vec_mask[i] & op_mod_i) ? op2_vec_sel_f16[i] : op1_vec_sel_f16[i];
+    end
+  end else begin : gen_no_32
+    assign result_f16 = '0;
+  end
+  if (XFVEC && FLEN >= 16) begin : gen_16
+    logic [FLEN/8-1:0][7:0] op1_vec_f8, op2_vec_f8, op1_vec_sel_f8, op2_vec_sel_f8;
+    assign op1_vec_f8 = operands_i[0];
+    assign op2_vec_f8 = operands_i[2];
+    for (genvar i = 0; i < FLEN/8; i++) begin : gen_vec_sel
+      assign op1_vec_sel_f8[i] = op1_vec_f8[element_mask[i]];
+      assign op2_vec_sel_f8[i] = op2_vec_f8[element_mask[i]];
+      assign result_f8[(i*8) +: 8] = (vec_mask[i] & op_mod_i) ? op2_vec_sel_f8[i] : op1_vec_sel_f8[i];
+    end
+  end else begin : gen_no_16
+    assign result_f8 = '0;
+  end
+
+  // Result mux: pick the datapath matching src_fmt_i; held at zero outside a handshake.
+  always_comb begin
+    result_o = '0;
+    if (in_valid_i & out_ready_i) begin
+      unique case (src_fmt_i)
+        fpnew_pkg::FP32:                     result_o = result_f32;
+        fpnew_pkg::FP16, fpnew_pkg::FP16ALT: result_o = result_f16;
+        fpnew_pkg::FP8,  fpnew_pkg::FP8ALT:  result_o = result_f8;
+        default: ;
+      endcase
+    end
+  end
+
+endmodule