Skip to content

Commit

Permalink
[hardware] WIP: add vrgather/vcompress support
Browse files Browse the repository at this point in the history
  • Loading branch information
mp-17 committed Nov 21, 2024
1 parent c5cc29e commit 171139b
Showing 1 changed file with 93 additions and 16 deletions.
109 changes: 93 additions & 16 deletions hardware/src/masku/masku.sv
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
logic [3:0] elm_per_lane; // From 0 to 8 elements per lane
logic [NrLanes-1:0] additional_elm; // There can be an additional element for some lanes
// BE signals for VIOTA
logic [NrLanes*DataWidth/8-1:0] be_viota_seq_d, be_viota_seq_q, be_viota_shuf;
// Byte-enable vectors, one bit per result byte, before shuffling (sequential index):
//  - be_viota_seq_d/_q: byte enables for VIOTA, VID
//  - be_vrgat_seq_d/_q: byte enables written by the VRGATHER/VCOMPRESS ALU case
// The *_d signals are the next-state values, the *_q signals the registered ones.
logic [NrLanes*DataWidth/8-1:0] be_viota_seq_d, be_viota_seq_q, be_vrgat_seq_d, be_vrgat_seq_q;
// Byte enables after the shuffle_index() remapping, filled from the *_seq_d vectors above
logic [NrLanes*DataWidth/8-1:0] be_masku_alu_shuf;

// Local Parameter VcpopParallelism and VfirstParallelism
//
Expand Down Expand Up @@ -675,6 +676,66 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
end
endcase
end
// VRGATHER, VCOMPRESS get elements from the vd operand queue (not to complicate the ALU control)
// Then, they just shuffle the operand into the correct place
// This operation writes vsew-bit elements with vtype.vsew encoding
// The vd source can have a different encoding (it gets deshuffled in the masku_operand stage)
//
// Summary of this case item:
//  1) copy one vsew-bit element of masku_operand_vd_seq, selected by vrgat_idx, into the
//     slot of vrgat_res selected by out_valid_cnt_q;
//  2) set the matching byte-enable bits of be_vrgat_seq_d;
//  3) forward vrgat_res on alu_result_vm_m.
// NOTE(review): vrgat_idx is read below but never assigned inside this block, and vrgat_cnt /
// vrgat_eew are declared but unused here - presumably pending WIP logic; confirm.
[VRGATHER:VCOMPRESS]: begin
// Result buffer, NrLanes*DataWidth bits wide; only one element slot is overwritten below
logic [NrLanes*DataWidth-1:0] vrgat_res;
// Buffer for the current (gathered) element, up to 64 bits
logic [DataWidth-1:0] vrgat_buf;
// EEW of the current element (not used in this block yet)
vsew_e vrgat_eew;
// Max 64Ki elements to count (not used in this block yet)
logic [15:0] vrgat_cnt;
// Current index: selects the source element inside masku_operand_vd_seq
logic [15:0] vrgat_idx;

// Extract the correct elements
// Every bit of the result that is not overwritten by the selected element stays at 1
vrgat_res = '1; // Default assignment
// Default: treat the source as 64-bit elements; the case below overwrites the low vsew bits
vrgat_buf = masku_operand_vd_seq[vrgat_idx[idx_width(NrLanes*ELENB/8)-1:0] * 64 +: 64]; // Default assignment
// For each SEW: read element number vrgat_idx from the vd operand and write it to
// element slot out_valid_cnt_q of the result. Both indices are truncated to the number
// of bits returned by idx_width() for that element width.
unique case (vinsn_issue.vtype.vsew)
EW8: begin
vrgat_buf[0 +: 8] = masku_operand_vd_seq[vrgat_idx[idx_width(NrLanes*ELENB/1)-1:0] * 8 +: 8];
vrgat_res[out_valid_cnt_q[idx_width(NrLanes*ELENB/1)-1:0] * 8 +: 8] = vrgat_buf[0 +: 8];
end
EW16: begin
vrgat_buf[0 +: 16] = masku_operand_vd_seq[vrgat_idx[idx_width(NrLanes*ELENB/2)-1:0] * 16 +: 16];
vrgat_res[out_valid_cnt_q[idx_width(NrLanes*ELENB/2)-1:0] * 16 +: 16] = vrgat_buf[0 +: 16];
end
EW32: begin
vrgat_buf[0 +: 32] = masku_operand_vd_seq[vrgat_idx[idx_width(NrLanes*ELENB/4)-1:0] * 32 +: 32];
vrgat_res[out_valid_cnt_q[idx_width(NrLanes*ELENB/4)-1:0] * 32 +: 32] = vrgat_buf[0 +: 32];
end
default: begin // EW64
// Same slice as the default assignment of vrgat_buf above
vrgat_buf[0 +: 64] = masku_operand_vd_seq[vrgat_idx[idx_width(NrLanes*ELENB/8)-1:0] * 64 +: 64];
vrgat_res[out_valid_cnt_q[idx_width(NrLanes*ELENB/8)-1:0] * 64 +: 64] = vrgat_buf[0 +: 64];
end
endcase

// BE signal for VRGATHER
// The element slot out_valid_cnt_q owns vsew/8 byte-enable bits (1, 2, 4 or 8).
// They are all set when vinsn_issue.vm is set, or when the mask bit of masku_operand_m_seq
// selected by in_m_ready_cnt_q is set; otherwise they are all cleared.
// NOTE(review): no default assignment of be_vrgat_seq_d is visible in this block - confirm
// that the enclosing process assigns one, so the untouched bits are well defined.
unique case (vinsn_issue.vtype.vsew)
EW8: begin
be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8)-1:0] * 1 +: 1] =
{1{vinsn_issue.vm}} | {1{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
end
EW16: begin
be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/16)-1:0] * 2 +: 2] =
{2{vinsn_issue.vm}} | {2{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
end
EW32: begin
be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/32)-1:0] * 4 +: 4] =
{4{vinsn_issue.vm}} | {4{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
end
default: begin // EW64
be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/64)-1:0] * 8 +: 8] =
{8{vinsn_issue.vm}} | {8{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
end
endcase

// Drive the ALU result with the assembled word
alu_result_vm_m = vrgat_res;
end
// VCPOP, VFIRST: mask the current slice and feed the popc or lzc unit
[VCPOP:VFIRST] : begin
vcpop_operand = (!vinsn_issue.vm) ? masku_operand_alu_seq & masku_operand_m_seq : masku_operand_alu_seq;
Expand All @@ -690,10 +751,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
end

// Shuffle the byte enable signal of the MASKU ALU result (VIOTA, VID, VRGATHER)
// Each sequential byte index b is remapped through shuffle_index() for the current vsew;
// both shuffled vectors start from all zeros.
// NOTE(review): the be_viota_shuf statements look like the pre-commit counterparts of the
// be_masku_alu_shuf ones (this text comes from a diff view) - confirm which set is meant to remain.
be_viota_shuf = '0;
be_masku_alu_shuf = '0;
for (int b = 0; b < (NrLanes*StrbWidth); b++) begin
automatic int shuffle_byte = shuffle_index(b, NrLanes, vinsn_issue.vtype.vsew);
be_viota_shuf[shuffle_byte] = be_viota_seq_d[b];
// VRGATHER takes its byte enables from be_vrgat_seq_d, every other op from be_viota_seq_d.
// NOTE(review): the ALU case that writes be_vrgat_seq_d is [VRGATHER:VCOMPRESS], but only
// VRGATHER is tested here - confirm whether VCOMPRESS should select be_vrgat_seq_d too.
be_masku_alu_shuf[shuffle_byte] = vinsn_issue.op == VRGATHER ? be_vrgat_seq_d[b] : be_viota_seq_d[b];
end

// Simplify layout handling
Expand Down Expand Up @@ -782,9 +843,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
out_valid_cnt_clr = 1'b0;
iteration_cnt_clr = 1'b0;

/////////////////////
// Mask Operands //
/////////////////////
////////////////////////////
// Predicated execution //
////////////////////////////

// Instructions that run in other units, but need mask strobes for predicated execution

Expand Down Expand Up @@ -863,10 +924,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
end
end

//////////////////////////////////////
// Send Mask Operands to the VFUs //
//////////////////////////////////////

// Send Mask Operands to the VFUs
for (int lane = 0; lane < NrLanes; lane++) begin: send_operand
mask_valid_o[lane] = mask_queue_valid_q[mask_queue_read_pnt_q][lane];
mask_o[lane] = mask_queue_q[mask_queue_read_pnt_q][lane];
Expand Down Expand Up @@ -909,6 +967,25 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
end
end

////////////////////////
// Index generation //
////////////////////////

// VRGATHER, VCOMPRESS require index generation and ad-hoc operand requesters
// The indices come from the VALU, while the operands will pass through the Vd operand queue (MaskB)
// This implementation is simple and unoptimized:
// We ask all the lanes in parallel for a precise index, and we will get a balanced payload from them.
// Only one element of the payload is important, the rest is discarded.
// This can be easily optimized by asking only the correct lane and by handling unbalanced payloads.
logic [NrLanes-1:0] vrgat_idx_req;
logic vrgat_idx_addr;

// VRGATHER


// VCOMPRESS


///////////////////////
// MASKU ALU Control //
///////////////////////
Expand All @@ -935,7 +1012,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata, // Retain the last-cycle's data
// VIOTA, VID generate a non-mask vector and should comply with undisturbed policy
// This means that we can use the byte-enable signal
be : vinsn_issue.op inside {[VIOTA:VID]} ? be(elm_per_lane + additional_elm[lane], vinsn_issue.vtype.vsew) & be_viota_shuf[lane*StrbWidth +: StrbWidth] : '1,
be : vinsn_issue.op inside {[VIOTA:VID]} ? be(elm_per_lane + additional_elm[lane], vinsn_issue.vtype.vsew) & be_masku_alu_shuf[lane*StrbWidth +: StrbWidth] : '1,
addr : vaddr(vinsn_issue.vd, NrLanes, VLEN) + iteration_cnt_q,
id : vinsn_issue.id
};
Expand All @@ -944,7 +1021,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
// Is there an instruction ready to be issued?
if (vinsn_issue_valid && vinsn_issue.op inside {[VMFEQ:VMXNOR]}) begin
// Compute one slice if we can write and the necessary inputs are valid
if (!result_queue_full && (&masku_operand_alu_valid || vinsn_issue.op == VID)
// VID does not require any operand, while VRGATHER/VCOMPRESS's ALU operand is just preprocessed to get the indices.
// Therefore, VRGATHER/VCOMPRESS's ALU timing should follow the operands only, passed through Vd
if (!result_queue_full && (&masku_operand_alu_valid || vinsn_issue.op inside {VID,VRGATHER,VCOMPRESS})
&& (&masku_operand_vd_valid || !vinsn_issue.use_vd_op)
&& (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {[VMADC:VMSBC]})) begin

Expand Down Expand Up @@ -1026,10 +1105,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
end
end

/////////////////////
// Write results //
/////////////////////

// Write VRF words to the result queue
if (out_vrf_word_valid) begin
// Write to the lanes
Expand Down Expand Up @@ -1307,6 +1382,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
viota_acc_q <= '0;
found_one_q <= '0;
be_viota_seq_q <= '1; // Default: write
be_vrgat_seq_q <= '1; // Default: write
end else begin
vinsn_running_q <= vinsn_running_d;
read_cnt_q <= read_cnt_d;
Expand All @@ -1326,6 +1402,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
viota_acc_q <= viota_acc_d;
found_one_q <= found_one_d;
be_viota_seq_q <= be_viota_seq_d;
be_vrgat_seq_q <= be_vrgat_seq_d;
end
end

Expand Down

0 comments on commit 171139b

Please sign in to comment.