Skip to content

Commit

Permalink
superscalar add second ALU (#2303)
Browse files Browse the repository at this point in the history
  • Loading branch information
cathales authored Jun 30, 2024
1 parent aa5d7f8 commit 636e6af
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 10 deletions.
6 changes: 6 additions & 0 deletions core/cva6.sv
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,8 @@ module cva6
logic [CVA6Cfg.XLEN-1:0] fpu_result_ex_id;
logic fpu_valid_ex_id;
exception_t fpu_exception_ex_id;
// ALU2
logic [SUPERSCALAR:0] alu2_valid_id_ex;
// Accelerator
logic stall_acc_id;
scoreboard_entry_t issue_instr_id_acc;
Expand Down Expand Up @@ -755,6 +757,8 @@ module cva6
.fpu_valid_o (fpu_valid_id_ex),
.fpu_fmt_o (fpu_fmt_id_ex),
.fpu_rm_o (fpu_rm_id_ex),
// ALU2
.alu2_valid_o (alu2_valid_id_ex),
// CSR
.csr_valid_o (csr_valid_id_ex),
// CVXIF
Expand Down Expand Up @@ -864,6 +868,8 @@ module cva6
.fpu_result_o (fpu_result_ex_id),
.fpu_valid_o (fpu_valid_ex_id),
.fpu_exception_o (fpu_exception_ex_id),
// ALU2
.alu2_valid_i (alu2_valid_id_ex),
.amo_valid_commit_i (amo_valid_commit),
.amo_req_o (amo_req),
.amo_resp_i (amo_resp),
Expand Down
71 changes: 65 additions & 6 deletions core/ex_stage.sv
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ module ex_stage
output logic fpu_valid_o,
// FPU exception - ISSUE_STAGE
output exception_t fpu_exception_o,
// ALU2 instruction is valid - ISSUE_STAGE
input logic [SUPERSCALAR:0] alu2_valid_i,
// CVXIF instruction is valid - ISSUE_STAGE
input logic [SUPERSCALAR:0] x_valid_i,
// CVXIF is ready - ISSUE_STAGE
Expand Down Expand Up @@ -386,6 +388,12 @@ module ex_stage
// ----------------
// FPU
// ----------------
// Intermediate FPU outputs. They are driven by the FPU instance when
// CVA6Cfg.FpPresent is set and tied to '0 otherwise, and are combined with
// the ALU2 outputs in the result MUX that drives the fpu_*_o ports.
logic fpu_valid;
logic [CVA6Cfg.TRANS_ID_BITS-1:0] fpu_trans_id;
logic [CVA6Cfg.XLEN-1:0] fpu_result;
// NOTE(review): no driver or reader of alu2_valid is visible in this excerpt
// (the result MUX uses |alu2_valid_i directly) - confirm it is needed.
logic alu2_valid;
// Result of the second ALU ('0 when SUPERSCALAR is disabled).
logic [CVA6Cfg.XLEN-1:0] alu2_result;

generate
if (CVA6Cfg.FpPresent) begin : fpu_gen
fu_data_t fpu_data;
Expand Down Expand Up @@ -413,20 +421,71 @@ module ex_stage
.fpu_rm_i,
.fpu_frm_i,
.fpu_prec_i,
.fpu_trans_id_o,
.result_o(fpu_result_o),
.fpu_valid_o,
.fpu_trans_id_o(fpu_trans_id),
.result_o(fpu_result),
.fpu_valid_o(fpu_valid),
.fpu_exception_o
);
end else begin : no_fpu_gen
assign fpu_ready_o = '0;
assign fpu_trans_id_o = '0;
assign fpu_result_o = '0;
assign fpu_valid_o = '0;
assign fpu_trans_id = '0;
assign fpu_result = '0;
assign fpu_valid = '0;
assign fpu_exception_o = '0;
end
endgenerate

// ----------------
// ALU2
// ----------------
// Operands handed to the second ALU, selected from the issue port that
// currently flags an ALU2 instruction.
fu_data_t alu2_data;
if (SUPERSCALAR) begin : alu2_gen
  // Operand selection: issue port 1 has priority over issue port 0; when
  // neither port targets ALU2 the operands are zeroed.
  always_comb begin
    if (alu2_valid_i[1]) begin
      alu2_data = fu_data_i[1];
    end else begin
      alu2_data = alu2_valid_i[0] ? fu_data_i[0] : '0;
    end
  end

  alu #(
      .CVA6Cfg  (CVA6Cfg),
      .fu_data_t(fu_data_t)
  ) alu2_i (
      .clk_i           (clk_i),
      .rst_ni          (rst_ni),
      .fu_data_i       (alu2_data),
      .result_o        (alu2_result),
      .alu_branch_res_o( /* this ALU does not handle branching */)
  );
end else begin
  // No second ALU without SUPERSCALAR: tie its operands and result to zero.
  assign alu2_data   = '0;
  assign alu2_result = '0;
end

// result MUX
// The fpu_*_o ports carry either the FPU result or the ALU2 result.
// Every configuration keeps its own explicit set of assignments so that
// synthesis tools can elide the signals that configuration never uses.
if (CVA6Cfg.FpPresent) begin
  if (SUPERSCALAR) begin
    // FPU and ALU2 both present: a valid FPU result takes precedence.
    assign fpu_valid_o    = fpu_valid || (|alu2_valid_i);
    assign fpu_result_o   = fpu_valid ? fpu_result : alu2_result;
    assign fpu_trans_id_o = fpu_valid ? fpu_trans_id : alu2_data.trans_id;
  end else begin
    // FPU only: pass the FPU outputs straight through.
    assign fpu_valid_o    = fpu_valid;
    assign fpu_result_o   = fpu_result;
    assign fpu_trans_id_o = fpu_trans_id;
  end
end else begin
  if (SUPERSCALAR) begin
    // ALU2 only: the ports are driven by ALU2 alone.
    assign fpu_valid_o    = |alu2_valid_i;
    assign fpu_result_o   = alu2_result;
    assign fpu_trans_id_o = alu2_data.trans_id;
  end else begin
    // Neither unit present: tie the ports off.
    assign fpu_valid_o    = '0;
    assign fpu_result_o   = '0;
    assign fpu_trans_id_o = '0;
  end
end

// ----------------
// Load-Store Unit
// ----------------
Expand Down
53 changes: 49 additions & 4 deletions core/issue_read_operands.sv
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ module issue_read_operands
output logic [1:0] fpu_fmt_o,
// FPU rm field from instruction - TO_BE_COMPLETED
output logic [2:0] fpu_rm_o,
// ALU2 output is valid - TO_BE_COMPLETED
output logic [SUPERSCALAR:0] alu2_valid_o,
// CSR result is valid - TO_BE_COMPLETED
output logic [SUPERSCALAR:0] csr_valid_o,
// CVXIF result is valid - TO_BE_COMPLETED
Expand All @@ -120,7 +122,7 @@ module issue_read_operands
localparam OPERANDS_PER_INSTR = CVA6Cfg.NrRgprPorts >> SUPERSCALAR;

typedef struct packed {
logic none, load, store, alu, ctrl_flow, mult, csr, fpu, fpu_vec, cvxif, accel;
logic none, load, store, alu, alu2, ctrl_flow, mult, csr, fpu, fpu_vec, cvxif, accel;
} fus_busy_t;

logic [SUPERSCALAR:0] stall;
Expand All @@ -140,6 +142,7 @@ module issue_read_operands
logic [ SUPERSCALAR:0] fpu_valid_q;
logic [ 1:0] fpu_fmt_q;
logic [ 2:0] fpu_rm_q;
logic [ SUPERSCALAR:0] alu2_valid_q;
logic [ SUPERSCALAR:0] lsu_valid_q;
logic [ SUPERSCALAR:0] csr_valid_q;
logic [ SUPERSCALAR:0] branch_valid_q;
Expand Down Expand Up @@ -171,6 +174,7 @@ module issue_read_operands
assign fpu_valid_o = fpu_valid_q;
assign fpu_fmt_o = fpu_fmt_q;
assign fpu_rm_o = fpu_rm_q;
assign alu2_valid_o = alu2_valid_q;
assign cvxif_valid_o = CVA6Cfg.CvxifEn ? cvxif_valid_q : '0;
assign cvxif_off_instr_o = CVA6Cfg.CvxifEn ? cvxif_off_instr_q : '0;
assign stall_issue_o = stall[0];
Expand Down Expand Up @@ -200,6 +204,7 @@ module issue_read_operands
if (CVA6Cfg.FpPresent && !fpu_ready_i) begin
fus_busy[0].fpu = 1'b1;
fus_busy[0].fpu_vec = 1'b1;
if (SUPERSCALAR) fus_busy[0].alu2 = 1'b1;
end

if (!lsu_ready_i) begin
Expand Down Expand Up @@ -238,7 +243,20 @@ module issue_read_operands
end
end
end
ALU, CSR: begin
ALU: begin
if (SUPERSCALAR && !fus_busy[0].alu2) begin
fus_busy[1].alu2 = 1'b1;
// TODO is there a minimum float execution time?
// If so we could issue FPU & ALU2 the same cycle
fus_busy[1].fpu = 1'b1;
fus_busy[1].fpu_vec = 1'b1;
end else begin
fus_busy[1].alu = 1'b1;
fus_busy[1].ctrl_flow = 1'b1;
fus_busy[1].csr = 1'b1;
end
end
CSR: begin
fus_busy[1].alu = 1'b1;
fus_busy[1].ctrl_flow = 1'b1;
fus_busy[1].csr = 1'b1;
Expand All @@ -263,7 +281,13 @@ module issue_read_operands
always_comb begin
unique case (issue_instr_i[i].fu)
NONE: fu_busy[i] = fus_busy[i].none;
ALU: fu_busy[i] = fus_busy[i].alu;
ALU: begin
if (SUPERSCALAR && !fus_busy[i].alu2) begin
fu_busy[i] = fus_busy[i].alu2;
end else begin
fu_busy[i] = fus_busy[i].alu;
end
end
CTRL_FLOW: fu_busy[i] = fus_busy[i].ctrl_flow;
CSR: fu_busy[i] = fus_busy[i].csr;
MULT: fu_busy[i] = fus_busy[i].mult;
Expand Down Expand Up @@ -450,6 +474,7 @@ module issue_read_operands
fpu_valid_q <= '0;
fpu_fmt_q <= '0;
fpu_rm_q <= '0;
alu2_valid_q <= '0;
csr_valid_q <= '0;
branch_valid_q <= '0;
end else begin
Expand All @@ -459,6 +484,7 @@ module issue_read_operands
fpu_valid_q <= '0;
fpu_fmt_q <= '0;
fpu_rm_q <= '0;
alu2_valid_q <= '0;
csr_valid_q <= '0;
branch_valid_q <= '0;
// Exception pass through:
Expand All @@ -468,7 +494,11 @@ module issue_read_operands
if (!issue_instr_i[i].ex.valid && issue_instr_valid_i[i] && issue_ack_o[i]) begin
case (issue_instr_i[i].fu)
ALU: begin
alu_valid_q[i] <= 1'b1;
if (SUPERSCALAR && !fus_busy[i].alu2) begin
alu2_valid_q[i] <= 1'b1;
end else begin
alu_valid_q[i] <= 1'b1;
end
end
CTRL_FLOW: begin
branch_valid_q[i] <= 1'b1;
Expand Down Expand Up @@ -503,6 +533,7 @@ module issue_read_operands
lsu_valid_q <= '0;
mult_valid_q <= '0;
fpu_valid_q <= '0;
alu2_valid_q <= '0;
csr_valid_q <= '0;
branch_valid_q <= '0;
end
Expand Down Expand Up @@ -780,6 +811,20 @@ module issue_read_operands
);
end

// Why superscalar + FPU is rejected for now:
// - The FPU does not announce that it will return a result in the following
//   cycle, so when an FPU is present the issue stage cannot know when ALU2
//   may be used.
// - Because there are discussions about changing the FPU, its architecture
//   was not explored to derive such an "FPU returns next cycle" signal.
// - To perform well with an FPU, a "lookahead" optimization should also be
//   added: when issue port 2 is issuing to the FPU, issue port 1 should issue
//   to ALU1 instead of ALU2 so that the FPU is not busy.
// - However, if the FPU has a minimum execution time of 2 cycles, it is
//   possible to simply not raise fus_busy[1].alu2.
initial begin
  assert (!SUPERSCALAR || !CVA6Cfg.FpPresent)
  else
    $fatal(1, "FPU is not yet supported in superscalar CVA6, see comments above this assertion.");
end

for (genvar i = 0; i <= SUPERSCALAR; i++) begin
assert property (@(posedge clk_i) (branch_valid_q) |-> (!$isunknown(
fu_data_q[i].operand_a
Expand Down
3 changes: 3 additions & 0 deletions core/issue_stage.sv
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ module issue_stage
output logic [1:0] fpu_fmt_o,
// FPU rm field - EX_STAGE
output logic [2:0] fpu_rm_o,
// ALU2 FU is valid - EX_STAGE
output logic [SUPERSCALAR:0] alu2_valid_o,
// CSR is valid - EX_STAGE
output logic [SUPERSCALAR:0] csr_valid_o,
// CVXIF FU is valid - EX_STAGE
Expand Down Expand Up @@ -231,6 +233,7 @@ module issue_stage
.rd_clobber_gpr_i (rd_clobber_gpr_sb_iro),
.rd_clobber_fpr_i (rd_clobber_fpr_sb_iro),
.alu_valid_o (alu_valid_o),
.alu2_valid_o (alu2_valid_o),
.branch_valid_o (branch_valid_o),
.csr_valid_o (csr_valid_o),
.cvxif_valid_o (x_issue_valid_o),
Expand Down

0 comments on commit 636e6af

Please sign in to comment.