Skip to content

Commit

Permalink
[hardware] Refactor the MASKU
Browse files Browse the repository at this point in the history
Note: the lanes are not synchronized with each other when sending operands.
Therefore, the spill registers need to handshake with each lane individually.
  • Loading branch information
mp-17 committed Nov 25, 2024
1 parent bd4c555 commit 86a84b7
Show file tree
Hide file tree
Showing 10 changed files with 1,362 additions and 973 deletions.
8 changes: 7 additions & 1 deletion hardware/include/ara_pkg.sv
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,15 @@ package ara_pkg;
// Floating-point comparison instructions
VMFEQ, VMFLE, VMFLT, VMFNE, VMFGT, VMFGE,
// Integer comparison instructions
VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSBF, VMSOF, VMSIF, VIOTA, VID, VCPOP, VFIRST, VMSGT,
VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSGT,
// Integer add-with-carry and subtract-with-borrow carry-out instructions
VMADC, VMSBC,
// Mask to mask
VMSBF, VMSOF, VMSIF,
// Mask to non-mask
VIOTA, VID,
// Mask to scalar
VCPOP, VFIRST,
// Mask operations
VMANDNOT, VMAND, VMOR, VMXOR, VMORNOT, VMNAND, VMNOR, VMXNOR,
// Scalar moves from VRF
Expand Down
7 changes: 7 additions & 0 deletions hardware/include/rvv_pkg.sv
Original file line number Diff line number Diff line change
Expand Up @@ -161,4 +161,11 @@ package rvv_pkg;
// The mask register is always vreg[0]
localparam VMASK = 5'b00000;

/////////////////////////
// VLEN restrictions //
/////////////////////////

// RISC-V maximum VLEN == 64 Ki bits (2^16)
localparam int unsigned RISCV_MAX_VLEN = 1 << 16;

endpackage : rvv_pkg
352 changes: 274 additions & 78 deletions hardware/src/ara_dispatcher.sv

Large diffs are not rendered by default.

218 changes: 123 additions & 95 deletions hardware/src/lane/lane_sequencer.sv

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions hardware/src/lane/operand_requester.sv
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
automatic elen_t vl_byte;
automatic elen_t vstart_byte;
automatic elen_t vector_body_len_byte;
automatic elen_t vector_body_len_packets;
automatic elen_t vector_body_len_elements;

// Bank we are currently requesting
automatic int bank = requester_metadata_q.addr[idx_width(NrBanks)-1:0];
Expand Down Expand Up @@ -324,13 +324,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
? 0
: operand_request_i[requester_index].vstart << operand_request_i[requester_index].vtype.vsew;
vector_body_len_byte = vl_byte - vstart_byte + (vstart_byte % 8);
vector_body_len_packets = vector_body_len_byte >> operand_request_i[requester_index].eew;
if (vector_body_len_packets << operand_request_i[requester_index].eew < vector_body_len_byte)
vector_body_len_packets += 1;
vector_body_len_elements = vector_body_len_byte >> operand_request_i[requester_index].eew;
if (vector_body_len_elements << operand_request_i[requester_index].eew < vector_body_len_byte)
vector_body_len_elements += 1;

// Final computed length
effective_vector_body_length = (operand_request_i[requester_index].scale_vl)
? vector_body_len_packets
? vector_body_len_elements
: vector_body_length;

// Address of the vstart element of the vector in the VRF
Expand Down Expand Up @@ -401,7 +401,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
end : waw_counters_update

if (operand_queue_ready_i[requester_index]) begin
automatic vlen_t num_bytes;
automatic vlen_t num_elements;

// Operand request
lane_operand_req_transposed[requester_index][bank] = !stall;
Expand All @@ -417,12 +417,12 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
requester_metadata_d.addr = requester_metadata_q.addr + 1'b1;

// We read less than 64 bits worth of elements
num_bytes = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) );
if (requester_metadata_q.len < num_bytes) begin
num_elements = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) );
if (requester_metadata_q.len < num_elements) begin
requester_metadata_d.len = 0;
end
else begin
requester_metadata_d.len = requester_metadata_q.len - num_bytes;
requester_metadata_d.len = requester_metadata_q.len - num_elements;
end
end : op_req_grant

Expand Down
7 changes: 2 additions & 5 deletions hardware/src/lane/simd_alu.sv
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,8 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
VMXOR : res = operand_a_i ^ operand_b_i;
VMXNOR : res = ~(operand_a_i ^ operand_b_i);

// vmsbf, vmsof, vmsif and viota operand generation
VMSBF, VMSOF, VMSIF, VIOTA : res = opb;

// Vector count population and find first set bit instructions
VCPOP, VFIRST : res = operand_b_i;
// Mask operands pass-through
VCPOP, VFIRST, VMSBF, VMSOF, VMSIF, VIOTA: res = operand_b_i;

// Arithmetic instructions
VSADDU: if (FixPtSupport == FixedPointEnable) unique case (vew_i)
Expand Down
119 changes: 47 additions & 72 deletions hardware/src/lane/valu.sv
Original file line number Diff line number Diff line change
Expand Up @@ -175,22 +175,25 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
// Mask operands //
/////////////////////

logic mask_operand_valid;
logic mask_operand_ready;
logic mask_operand_gnt;

assign mask_operand_gnt = mask_operand_ready && result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q];
assign mask_operand_valid = result_queue_q[result_queue_read_pnt_q].mask
& result_queue_valid_q[result_queue_read_pnt_q];
assign mask_operand_gnt = mask_operand_valid & mask_operand_ready;

spill_register #(
.T(elen_t)
) i_mask_operand_register (
.clk_i (clk_i ),
.rst_ni (rst_ni ),
.data_o (mask_operand_o ),
.valid_o (mask_operand_valid_o ),
.ready_i (mask_operand_ready_i ),
.data_i (result_queue_q[result_queue_read_pnt_q].wdata ),
.valid_i (result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q]),
.ready_o (mask_operand_ready )
.clk_i (clk_i ),
.rst_ni (rst_ni ),
.data_o (mask_operand_o ),
.valid_o (mask_operand_valid_o ),
.ready_i (mask_operand_ready_i ),
.data_i (result_queue_q[result_queue_read_pnt_q].wdata ),
.valid_i (mask_operand_valid ),
.ready_o (mask_operand_ready )
);

//////////////////////
Expand Down Expand Up @@ -395,6 +398,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
// Remaining elements of the current instruction in the commit phase
vlen_t commit_cnt_d, commit_cnt_q;

// How many elements are issued/committed
logic [3:0] element_cnt_buf_issue, element_cnt_buf_commit;
logic [6:0] element_cnt_issue;
logic [6:0] element_cnt_commit;

always_comb begin: p_valu
// Maintain state
vinsn_queue_d = vinsn_queue_q;
Expand Down Expand Up @@ -436,6 +444,13 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
// Don't prevent commit by default
prevent_commit = 1'b0;

// How many elements are we processing this cycle?
element_cnt_buf_issue = 1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew));
element_cnt_issue = vinsn_issue_q.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_issue};

element_cnt_buf_commit = 1 << (unsigned'(EW64) - unsigned'(vinsn_commit.vtype.vsew));
element_cnt_commit = vinsn_commit.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_commit};

////////////////////////////////////////
// Write data into the result queue //
////////////////////////////////////////
Expand All @@ -450,7 +465,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
(alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1) &&
(mask_valid_i || vinsn_issue_q.vm)) begin
// How many elements are we committing with this word?
automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)));
automatic logic [6:0] element_cnt = element_cnt_issue;

if (element_cnt > issue_cnt_q)
element_cnt = issue_cnt_q;
Expand Down Expand Up @@ -524,16 +539,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1;

// Assign vector length for next instruction in the instruction queue
if (vinsn_queue_d.issue_cnt != 0) begin
if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
else begin
$warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
end
end
if (vinsn_queue_d.issue_cnt != 0)
issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
end
end
end
Expand All @@ -550,7 +557,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
(alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1 || !first_op_q) &&
(mask_valid_i || vinsn_issue_q.vm)) begin
// How many elements are we committing with this word?
automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)));
automatic logic [6:0] element_cnt = element_cnt_issue;

if (element_cnt > issue_cnt_q)
element_cnt = issue_cnt_q;

Expand Down Expand Up @@ -656,16 +664,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1;

// Assign vector length for next instruction in the instruction queue
if (vinsn_queue_d.issue_cnt != 0) begin
if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
else begin
$warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
end
end
if (vinsn_queue_d.issue_cnt != 0)
issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;

// Give the done to the main sequencer
commit_cnt_d = '0;
Expand Down Expand Up @@ -693,16 +693,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1;

// Assign vector length for next instruction in the instruction queue
if (vinsn_queue_d.issue_cnt != 0) begin
if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
else begin
$warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
end
end
if (vinsn_queue_d.issue_cnt != 0)
issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;

// Commit and give the done to the main sequencer
commit_cnt_d = '0;
Expand Down Expand Up @@ -739,7 +731,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
if (|result_queue_valid_q)
vxsat_flag_o = |(alu_vxsat_q & result_queue_q[result_queue_read_pnt_q].be);

// Received a grant from the VRF.
// Received a grant from the VRF or MASKU.
// Deactivate the request.
if (alu_result_gnt_i || mask_operand_gnt) begin
result_queue_valid_d[result_queue_read_pnt_q] = 1'b0;
Expand All @@ -754,9 +746,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;

// Decrement the counter of remaining vector elements waiting to be written
// Don't do it in case of a reduction
if (!is_reduction(vinsn_commit.op))
commit_cnt_d = commit_cnt_q - (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew));
if (commit_cnt_q < (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0;
if (!is_reduction(vinsn_commit.op)) begin
automatic logic [6:0] element_cnt = element_cnt_commit;
commit_cnt_d = commit_cnt_q - element_cnt;
if (commit_cnt_q < element_cnt) commit_cnt_d = '0;
end
end

// Finished committing the results of a vector instruction
Expand All @@ -770,18 +764,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
else vinsn_queue_d.commit_pnt += 1;

// Update the commit counter for the next instruction
if (vinsn_queue_d.commit_cnt != '0) begin
if (!(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op inside {[VMANDNOT:VMXNOR]}))
commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl;
else begin
// We are asking for bits, and we want at least one chunk of bits if
// vl > 0. Therefore, commit_cnt = ceil((vl / 8) >> sew)
$warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
commit_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl / 8) >>
vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew;
commit_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl[2:0];
end
end
if (vinsn_queue_d.commit_cnt != '0)
commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl;

// Initialize counters and alu state if needed by the next instruction
// After a reduction, the next instructions starts after the reduction commits
Expand All @@ -806,7 +790,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i;
// Do not wait for masks if, during a reduction, this lane is just a pass-through
// The only valid instructions here with vl == '0 are reductions
vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.vm | (vfu_operation_i.vl == '0);
// Instructions that execute in the mask unit will process the mask there directly
// VMADC/VMSBC require mask bits in the ALU
vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMSEQ:VMXNOR]} && !(vfu_operation_i.op inside {[VMADC:VMSBC]})
? 1'b1
: vfu_operation_i.vm | (vfu_operation_i.vl == '0);

// Initialize counters and alu state if the instruction queue was empty
// and the lane is not reducing
Expand All @@ -822,22 +810,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
sldu_transactions_cnt_d = $clog2(NrLanes) + 1;

issue_cnt_d = vfu_operation_i.vl;
if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]}))
issue_cnt_d = vfu_operation_i.vl;
else begin
issue_cnt_d = (vfu_operation_i.vl / 8) >>
vfu_operation_i.vtype.vsew;
issue_cnt_d += |vfu_operation_i.vl[2:0];
end
end
if (vinsn_queue_d.commit_cnt == '0)
if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]}))
commit_cnt_d = vfu_operation_i.vl;
else begin
// Operations between mask vectors operate on bits
commit_cnt_d = (vfu_operation_i.vl / 8) >> vfu_operation_i.vtype.vsew;
commit_cnt_d += |vfu_operation_i.vl[2:0];
end
commit_cnt_d = vfu_operation_i.vl;

// Bump pointers and counters of the vector instruction queue
vinsn_queue_d.accept_pnt += 1;
Expand Down
13 changes: 9 additions & 4 deletions hardware/src/lane/vmfpu.sv
Original file line number Diff line number Diff line change
Expand Up @@ -1245,21 +1245,18 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
(vinsn_processing_q.op == VMFNE) ?
~vfpu_processed_result[16*b] :
vfpu_processed_result[16*b];
for (int b = 0; b < 4; b++) vfpu_processed_result[16*b+1] = vfpu_mask[2*b];
end
EW32: begin
for (int b = 0; b < 2; b++) vfpu_processed_result[32*b] =
(vinsn_processing_q.op == VMFNE) ?
~vfpu_processed_result[32*b] :
vfpu_processed_result[32*b];
for (int b = 0; b < 2; b++) vfpu_processed_result[32*b+1] = vfpu_mask[4*b];
end
EW64: begin
for (int b = 0; b < 1; b++) vfpu_processed_result[b] =
(vinsn_processing_q.op == VMFNE) ?
~vfpu_processed_result[b] :
vfpu_processed_result[b];
for (int b = 0; b < 1; b++) vfpu_processed_result[b+1] = vfpu_mask[8*b];
end
endcase
end
Expand Down Expand Up @@ -2180,7 +2177,15 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;

if (!vinsn_queue_full && vfu_operation_valid_i &&
(vfu_operation_i.vfu == VFU_MFpu || vfu_operation_i.op inside {[VMFEQ:VMFGE]})) begin
vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i;
vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i;
// Masks are handled in the MASKU directly for comparisons
vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMFEQ:VMFGE]}
? 1'b1
: vfu_operation_i.vm;
// During comparisons, vd_op is for the masku, not for the VMFPU
vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].use_vd_op = vfu_operation_i.op inside {[VMFEQ:VMFGE]}
? 1'b0
: vfu_operation_i.use_vd_op;

// Initialize counters
if (vinsn_queue_d.issue_cnt == '0 && !prevent_commit) begin
Expand Down
Loading

0 comments on commit 86a84b7

Please sign in to comment.