diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2a1c8e11..bbe42be2f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -149,7 +149,7 @@ jobs: - name: Compile Verilator if: steps.tc-verilator-cache.outputs.cache-hit != 'true' run: | - sudo apt-get install flex libfl-dev + sudo apt-get install flex libfl-dev help2man make verilator - name: Tar Verilator run: tar -cvf tc-verilator.tar install/verilator diff --git a/CHANGELOG.md b/CHANGELOG.md index 64cdadbd4..79cde5334 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -135,6 +135,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - `core_id_i` added to the interface of the system - Clock-gate the system bank macros when not used (VRF, D$, I$) - Spill register on `sldu` input signals to better isolate the unit + - Clock-gate the unused SIMD-int multipliers to save power ### Changed @@ -185,6 +186,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Simplify the reduction engine for both `valu` and `vmfpu`, to avoid spurious valid signals to the `sldu` - Fix commit for `dtc` installation (`spike` dependency) - Simplify the datapath of the slide unit. The `sldu` supports only powers of two, and cannot slide and reshuffle at the same time. Non-power-of-two slides are now handled with micro operations. 
+ - Bump Verilator to v5.012 ## 2.2.0 - 2021-11-02 diff --git a/Makefile b/Makefile index 15c925e79..95c93fffc 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ LLVM_INSTALL_DIR ?= ${INSTALL_DIR}/riscv-llvm ISA_SIM_INSTALL_DIR ?= ${INSTALL_DIR}/riscv-isa-sim ISA_SIM_MOD_INSTALL_DIR ?= ${INSTALL_DIR}/riscv-isa-sim-mod VERIL_INSTALL_DIR ?= ${INSTALL_DIR}/verilator -VERIL_VERSION ?= v4.214 +VERIL_VERSION ?= v5.012 DTC_COMMIT ?= b6910bec11614980a21e46fbccc35934b671bd81 CMAKE ?= cmake diff --git a/hardware/Makefile b/hardware/Makefile index d85fd638c..0625fcc28 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -179,6 +179,8 @@ $(veril_library)/V$(veril_top): $(config_file) Makefile ../Bender.yml $(shell fi -Wno-UNSIGNED \ -Wno-WIDTH \ -Wno-WIDTHCONCAT \ + -Wno-ENUMVALUE \ + -Wno-COMBDLY \ --hierarchical \ tb/verilator/waiver.vlt \ --Mdir $(veril_library) \ diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index 36c76df21..c00844552 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -106,16 +106,18 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Do we have a vector instruction ready to be issued? vfu_operation_t vinsn_issue_d, vinsn_issue_q; - logic vinsn_issue_valid; + logic vinsn_issue_d_valid, vinsn_issue_q_valid; assign vinsn_issue_d = vinsn_queue_d.vinsn[vinsn_queue_d.issue_pnt]; - assign vinsn_issue_valid = (vinsn_queue_q.issue_cnt != '0); + assign vinsn_issue_d_valid = (vinsn_queue_d.issue_cnt != '0); + assign vinsn_issue_q_valid = (vinsn_queue_q.issue_cnt != '0); // Do we have a vector instruction being processed? 
vfu_operation_t vinsn_processing_d, vinsn_processing_q; - logic vinsn_processing_valid; - assign vinsn_processing_d = vinsn_queue_d.vinsn[vinsn_queue_d.processing_pnt]; - assign vinsn_processing_q = vinsn_queue_q.vinsn[vinsn_queue_q.processing_pnt]; - assign vinsn_processing_valid = (vinsn_queue_q.processing_cnt != '0); + logic vinsn_processing_d_valid, vinsn_processing_q_valid; + assign vinsn_processing_d = vinsn_queue_d.vinsn[vinsn_queue_d.processing_pnt]; + assign vinsn_processing_q = vinsn_queue_q.vinsn[vinsn_queue_q.processing_pnt]; + assign vinsn_processing_d_valid = (vinsn_queue_d.processing_cnt != '0); + assign vinsn_processing_q_valid = (vinsn_queue_q.processing_cnt != '0); // Do we have a vector instruction with results being committed? vfu_operation_t vinsn_commit; @@ -282,6 +284,18 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Multiplier // ////////////////// + // Clock-gate for the multipliers + logic clkgate_en_d, clkgate_en_q, clk_i_gated; + + tc_clk_gating i_simd_mul_manual_clk_gate ( + .clk_i (clk_i ), + .en_i (clkgate_en_q), + .test_en_i (1'b0 ), + .clk_o (clk_i_gated ) + ); + + assign clkgate_en_d = vinsn_processing_d_valid & (vinsn_processing_d.op inside {[VMUL:VSMUL]}); + elen_t [3:0] vmul_simd_result; logic [3:0] vmul_simd_in_valid; logic [3:0] vmul_simd_in_ready; @@ -318,17 +332,17 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; ~vmul_simd_in_valid[vinsn_issue_q.vtype.vsew]; `FFLARNC(vmul_simd_op_a_q, vinsn_issue_q.use_scalar_op ? 
scalar_op : mfpu_operand_i[0], - gate_ff_en, gate_ff_clr, '0, clk_i, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); `FFLARNC(vmul_simd_op_b_q, mfpu_operand_i[1], - gate_ff_en, gate_ff_clr, '0, clk_i, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); `FFLARNC(vmul_simd_op_c_q, mfpu_operand_i[2], - gate_ff_en, gate_ff_clr, '0, clk_i, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); `FFLARNC(vmul_simd_mask_q, mask_i, - gate_ff_en, gate_ff_clr, '0, clk_i, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); `FFLARNC(vmul_simd_op_q, vinsn_issue_q.op, - gate_ff_en, gate_ff_clr, ara_op_e'('0), clk_i, rst_ni); + gate_ff_en, gate_ff_clr, ara_op_e'('0), clk_i_gated, rst_ni); `FFLARNC(vmul_simd_in_valid_q, vmul_simd_in_valid, - gate_ff_en, gate_ff_clr, '0, clk_i, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); for (genvar i = 0; i < 4; i++) begin `ifdef GF22 @@ -398,7 +412,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; .NumPipeRegs (LatMultiplierEW64), .ElementWidth(EW64 ) ) i_simd_mul_ew64 ( - .clk_i (clk_i ), + .clk_i (clk_i_gated ), .rst_ni (rst_ni ), .operand_a_i(vmul_simd_op_a_q_gated[EW64] ), .operand_b_i(vmul_simd_op_b_q_gated[EW64] ), @@ -420,7 +434,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; .NumPipeRegs (LatMultiplierEW32), .ElementWidth(EW32 ) ) i_simd_mul_ew32 ( - .clk_i (clk_i ), + .clk_i (clk_i_gated ), .rst_ni (rst_ni ), .operand_a_i(vmul_simd_op_a_q_gated[EW32] ), .operand_b_i(vmul_simd_op_b_q_gated[EW32] ), @@ -442,7 +456,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; .NumPipeRegs (LatMultiplierEW16), .ElementWidth(EW16 ) ) i_simd_mul_ew16 ( - .clk_i (clk_i ), + .clk_i (clk_i_gated ), .rst_ni (rst_ni ), .operand_a_i(vmul_simd_op_a_q_gated[EW16] ), .operand_b_i(vmul_simd_op_b_q_gated[EW16] ), @@ -464,7 +478,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; .NumPipeRegs 
(LatMultiplierEW8), .ElementWidth(EW8 ) ) i_simd_mul_ew8 ( - .clk_i (clk_i ), + .clk_i (clk_i_gated ), .rst_ni (rst_ni ), .operand_a_i(vmul_simd_op_a_q_gated[EW8] ), .operand_b_i(vmul_simd_op_b_q_gated[EW8] ), @@ -493,7 +507,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Only one SIMD Multiplier receives the request vmul_simd_in_valid = '0; vmul_simd_in_valid[vinsn_issue_q.vtype.vsew] = vmul_in_valid; - vmul_in_ready = vmul_simd_in_ready[vinsn_issue_q.vtype.vsew]; + vmul_in_ready = clkgate_en_q & vmul_simd_in_ready[vinsn_issue_q.vtype.vsew]; // Saturation flag mfpu_vxsat_d = mfpu_vxsat[vinsn_processing_q.vtype.vsew]; @@ -1298,7 +1312,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // If we are about to issue an instruction while another one is processing, // issue only if the new instruction is slower than the previous one latency_problem_d = vinsn_issue_lat_d < vinsn_processing_lat_d; - latency_stall = vinsn_issue_valid & vinsn_processing_valid & latency_problem_q; + latency_stall = vinsn_issue_q_valid & vinsn_processing_q_valid & latency_problem_q; operand_a = (vinsn_issue_q.op == VFRDIV) ? scalar_op : mfpu_operand_i[1]; // vs2 operand_b = (vinsn_issue_q.use_scalar_op && vinsn_issue_q.op != VFRDIV) @@ -1377,7 +1391,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; endcase // Is there a vector instruction ready to be issued and do we have all the operands necessary for this instruction? 
- if (operands_valid && vinsn_issue_valid && !is_reduction(vinsn_issue_q.op) && issue_cnt_q != '0 && !latency_stall) begin + if (operands_valid && vinsn_issue_q_valid && !is_reduction(vinsn_issue_q.op) && issue_cnt_q != '0 && !latency_stall) begin // Valiudate the inputs of the correct unit vmul_in_valid = vinsn_issue_mul; vdiv_in_valid = vinsn_issue_div; @@ -1684,7 +1698,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Issue the micro-operations // ======================================================= - if (operands_valid && vinsn_issue_valid) begin + if (operands_valid && vinsn_issue_q_valid) begin // Validate the inputs of FPU vfpu_in_valid = 1'b1; @@ -1892,7 +1906,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; mfpu_red_valid_o = red_hs_synch_q; // Issue the uOp - if (operands_valid && vinsn_issue_valid && issue_cnt_q != '0) begin + if (operands_valid && vinsn_issue_q_valid && issue_cnt_q != '0) begin vfpu_in_valid = 1'b1; if (vfpu_in_ready) begin // The number of elements to be issued in one 64-bit data @@ -2081,7 +2095,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; if (mfpu_state_q == NO_REDUCTION) begin // Initialize counters and vmfpu state if needed by the next instruction // After a reduction, the next instructions starts after the reduction commits - if (is_reduction(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op) && (vinsn_queue_d.issue_cnt != '0)) begin + if (is_reduction(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op) && (vinsn_issue_d_valid)) begin // The next will be the first operation of this instruction // This information is useful for reduction operation first_op_d = 1'b1; @@ -2201,6 +2215,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; intra_op_rx_cnt_q <= '0; osum_issue_cnt_q <= '0; mfpu_vxsat_q <= '0; + clkgate_en_q <= 1'b0; end else begin issue_cnt_q <= issue_cnt_d; to_process_cnt_q <= to_process_cnt_d; @@ -2224,6 
+2239,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; intra_op_rx_cnt_q <= intra_op_rx_cnt_d; osum_issue_cnt_q <= osum_issue_cnt_d; mfpu_vxsat_q <= mfpu_vxsat_d; + clkgate_en_q <= clkgate_en_d; end end diff --git a/toolchain/verilator b/toolchain/verilator index 8f090176a..7d2d32420 160000 --- a/toolchain/verilator +++ b/toolchain/verilator @@ -1 +1 @@ -Subproject commit 8f090176a68b66bbe0a6ce95294944bffda9fdf7 +Subproject commit 7d2d32420a630befa4097170ecbf227e04e32522