[hardware] Refactor the MASKU

Comment: the lanes are not synchronized when sending operands. Therefore, the spill regs need to handshake the lanes individually.
pulp-platform · Nov 25, 2024 · 86a84b7 · 86a84b7
1 parent bd4c555
commit 86a84b7
Show file tree

Hide file tree

Showing 10 changed files with 1,362 additions and 973 deletions.
diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv
@@ -140,9 +140,15 @@ package ara_pkg;
     // Floating-point comparison instructions
     VMFEQ, VMFLE, VMFLT, VMFNE, VMFGT, VMFGE,
     // Integer comparison instructions
-    VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSBF, VMSOF, VMSIF, VIOTA, VID, VCPOP, VFIRST, VMSGT,
+    VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSGT,
     // Integer add-with-carry and subtract-with-borrow carry-out instructions
     VMADC, VMSBC,
+    // Mask to mask
+    VMSBF, VMSOF, VMSIF,
+    // Mask to non-mask
+    VIOTA, VID,
+    // Mask to scalar
+    VCPOP, VFIRST,
     // Mask operations
     VMANDNOT, VMAND, VMOR, VMXOR, VMORNOT, VMNAND, VMNOR, VMXNOR,
     // Scalar moves from VRF

diff --git a/hardware/include/rvv_pkg.sv b/hardware/include/rvv_pkg.sv
@@ -161,4 +161,11 @@ package rvv_pkg;
   // The mask register is always vreg[0]
   localparam VMASK = 5'b00000;
 
+  /////////////////////////
+  //  VLEN restrictions  //
+  /////////////////////////
+
+  // RISC-V Maximum VLEN == 64Ki
+  localparam int unsigned RISCV_MAX_VLEN = 1 << 16;
+
 endpackage : rvv_pkg
diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv
diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv
diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv
@@ -291,7 +291,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
       automatic elen_t vl_byte;
       automatic elen_t vstart_byte;
       automatic elen_t vector_body_len_byte;
-      automatic elen_t vector_body_len_packets;
+      automatic elen_t vector_body_len_elements;
 
       // Bank we are currently requesting
       automatic int bank = requester_metadata_q.addr[idx_width(NrBanks)-1:0];
@@ -324,13 +324,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
                   ? 0
                   : operand_request_i[requester_index].vstart << operand_request_i[requester_index].vtype.vsew;
       vector_body_len_byte = vl_byte - vstart_byte + (vstart_byte % 8);
-      vector_body_len_packets = vector_body_len_byte >> operand_request_i[requester_index].eew;
-      if (vector_body_len_packets << operand_request_i[requester_index].eew < vector_body_len_byte)
-        vector_body_len_packets += 1;
+      vector_body_len_elements = vector_body_len_byte >> operand_request_i[requester_index].eew;
+      if (vector_body_len_elements << operand_request_i[requester_index].eew < vector_body_len_byte)
+        vector_body_len_elements += 1;
 
       // Final computed length
       effective_vector_body_length = (operand_request_i[requester_index].scale_vl)
-                                   ? vector_body_len_packets
+                                   ? vector_body_len_elements
                                    : vector_body_length;
 
       // Address of the vstart element of the vector in the VRF
@@ -401,7 +401,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
           end : waw_counters_update
 
           if (operand_queue_ready_i[requester_index]) begin
-            automatic vlen_t num_bytes;
+            automatic vlen_t num_elements;
 
             // Operand request
             lane_operand_req_transposed[requester_index][bank] = !stall;
@@ -417,12 +417,12 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
               requester_metadata_d.addr = requester_metadata_q.addr + 1'b1;
 
               // We read less than 64 bits worth of elements
-              num_bytes = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) );
-              if (requester_metadata_q.len < num_bytes) begin
+              num_elements = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) );
+              if (requester_metadata_q.len < num_elements) begin
                 requester_metadata_d.len    = 0;
               end
               else begin
-                requester_metadata_d.len = requester_metadata_q.len - num_bytes;
+                requester_metadata_d.len = requester_metadata_q.len - num_elements;
               end
             end : op_req_grant
 

diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv
@@ -132,11 +132,8 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
         VMXOR   : res = operand_a_i ^ operand_b_i;
         VMXNOR  : res = ~(operand_a_i ^ operand_b_i);
 
-        // vmsbf, vmsof, vmsif and viota operand generation
-        VMSBF, VMSOF, VMSIF, VIOTA : res = opb;
-
-	      // Vector count population and find first set bit instructions
-        VCPOP, VFIRST : res = operand_b_i;
+        // Mask operands pass-through
+        VCPOP, VFIRST, VMSBF, VMSOF, VMSIF, VIOTA: res = operand_b_i;
 
         // Arithmetic instructions
         VSADDU: if (FixPtSupport == FixedPointEnable) unique case (vew_i)

diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv
@@ -175,22 +175,25 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
   //  Mask operands  //
   /////////////////////
 
+  logic mask_operand_valid;
   logic mask_operand_ready;
   logic mask_operand_gnt;
 
-  assign mask_operand_gnt = mask_operand_ready && result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q];
+  assign mask_operand_valid = result_queue_q[result_queue_read_pnt_q].mask
+                            & result_queue_valid_q[result_queue_read_pnt_q];
+  assign mask_operand_gnt = mask_operand_valid & mask_operand_ready;
 
   spill_register #(
     .T(elen_t)
   ) i_mask_operand_register (
-    .clk_i     (clk_i                                                                                        ),
-    .rst_ni    (rst_ni                                                                                       ),
-    .data_o    (mask_operand_o                                                                               ),
-    .valid_o   (mask_operand_valid_o                                                                         ),
-    .ready_i   (mask_operand_ready_i                                                                         ),
-    .data_i    (result_queue_q[result_queue_read_pnt_q].wdata                                                ),
-    .valid_i   (result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q]),
-    .ready_o   (mask_operand_ready                                                                           )
+    .clk_i     (clk_i                                         ),
+    .rst_ni    (rst_ni                                        ),
+    .data_o    (mask_operand_o                                ),
+    .valid_o   (mask_operand_valid_o                          ),
+    .ready_i   (mask_operand_ready_i                          ),
+    .data_i    (result_queue_q[result_queue_read_pnt_q].wdata ),
+    .valid_i   (mask_operand_valid                            ),
+    .ready_o   (mask_operand_ready                            )
   );
 
   //////////////////////
@@ -395,6 +398,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
   // Remaining elements of the current instruction in the commit phase
   vlen_t commit_cnt_d, commit_cnt_q;
 
+  // How many elements are issued/committed
+  logic [3:0] element_cnt_buf_issue, element_cnt_buf_commit;
+  logic [6:0] element_cnt_issue;
+  logic [6:0] element_cnt_commit;
+
   always_comb begin: p_valu
     // Maintain state
     vinsn_queue_d = vinsn_queue_q;
@@ -436,6 +444,13 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
     // Don't prevent commit by default
     prevent_commit = 1'b0;
 
+    // How many elements are we processing this cycle?
+    element_cnt_buf_issue = 1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew));
+    element_cnt_issue = vinsn_issue_q.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_issue};
+
+    element_cnt_buf_commit = 1 << (unsigned'(EW64) - unsigned'(vinsn_commit.vtype.vsew));
+    element_cnt_commit = vinsn_commit.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_commit};
+
     ////////////////////////////////////////
     //  Write data into the result queue  //
     ////////////////////////////////////////
@@ -450,7 +465,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                 (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1) &&
                 (mask_valid_i || vinsn_issue_q.vm)) begin
               // How many elements are we committing with this word?
-              automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)));
+              automatic logic [6:0] element_cnt = element_cnt_issue;
 
               if (element_cnt > issue_cnt_q)
                 element_cnt = issue_cnt_q;
@@ -524,16 +539,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                   vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1;
 
                 // Assign vector length for next instruction in the instruction queue
-                if (vinsn_queue_d.issue_cnt != 0) begin
-                  if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
-                    issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
-                  else begin
-                    $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
-                    issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
-                      vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
-                    issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
-                  end
-                end
+                if (vinsn_queue_d.issue_cnt != 0)
+                  issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
               end
             end
           end
@@ -550,7 +557,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                 (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1 || !first_op_q) &&
                 (mask_valid_i || vinsn_issue_q.vm)) begin
               // How many elements are we committing with this word?
-              automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)));
+              automatic logic [6:0] element_cnt = element_cnt_issue;
+
               if (element_cnt > issue_cnt_q)
                 element_cnt = issue_cnt_q;
 
@@ -656,16 +664,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
             vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1;
 
           // Assign vector length for next instruction in the instruction queue
-          if (vinsn_queue_d.issue_cnt != 0) begin
-            if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
-              issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
-            else begin
-              $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
-              issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
-                vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
-              issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
-            end
-          end
+          if (vinsn_queue_d.issue_cnt != 0)
+            issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
 
           // Give the done to the main sequencer
           commit_cnt_d = '0;
@@ -693,16 +693,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                 vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1;
 
               // Assign vector length for next instruction in the instruction queue
-              if (vinsn_queue_d.issue_cnt != 0) begin
-                if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
-                  issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
-                else begin
-                  $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
-                  issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
-                    vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
-                  issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
-                end
-              end
+              if (vinsn_queue_d.issue_cnt != 0)
+                issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
 
               // Commit and give the done to the main sequencer
               commit_cnt_d = '0;
@@ -739,7 +731,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
     if (|result_queue_valid_q)
       vxsat_flag_o = |(alu_vxsat_q & result_queue_q[result_queue_read_pnt_q].be);
 
-    // Received a grant from the VRF.
+    // Received a grant from the VRF or MASKU.
     // Deactivate the request.
     if (alu_result_gnt_i || mask_operand_gnt) begin
       result_queue_valid_d[result_queue_read_pnt_q] = 1'b0;
@@ -754,9 +746,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
 
       // Decrement the counter of remaining vector elements waiting to be written
       // Don't do it in case of a reduction
-      if (!is_reduction(vinsn_commit.op))
-        commit_cnt_d = commit_cnt_q - (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew));
-      if (commit_cnt_q < (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0;
+      if (!is_reduction(vinsn_commit.op)) begin
+        automatic logic [6:0] element_cnt = element_cnt_commit;
+          commit_cnt_d = commit_cnt_q - element_cnt;
+        if (commit_cnt_q < element_cnt) commit_cnt_d = '0;
+      end
     end
 
     // Finished committing the results of a vector instruction
@@ -770,18 +764,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
       else vinsn_queue_d.commit_pnt += 1;
 
       // Update the commit counter for the next instruction
-      if (vinsn_queue_d.commit_cnt != '0) begin
-        if (!(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op inside {[VMANDNOT:VMXNOR]}))
-          commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl;
-        else begin
-          // We are asking for bits, and we want at least one chunk of bits if
-          // vl > 0. Therefore, commit_cnt = ceil((vl / 8) >> sew)
-          $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
-          commit_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl / 8) >>
-            vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew;
-          commit_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl[2:0];
-        end
-      end
+      if (vinsn_queue_d.commit_cnt != '0)
+        commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl;
 
       // Initialize counters and alu state if needed by the next instruction
       // After a reduction, the next instructions starts after the reduction commits
@@ -806,7 +790,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
       vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i;
       // Do not wait for masks if, during a reduction, this lane is just a pass-through
       // The only valid instructions here with vl == '0 are reductions
-      vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.vm | (vfu_operation_i.vl == '0);
+      // Instructions that execute in the mask unit will process the mask there directly
+      // VMADC/VMSBC requires mask bits in the ALU
+      vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMSEQ:VMXNOR]} && !(vfu_operation_i.op inside {[VMADC:VMSBC]})
+                                                       ? 1'b1
+                                                       : vfu_operation_i.vm | (vfu_operation_i.vl == '0);
 
       // Initialize counters and alu state if the instruction queue was empty
       // and the lane is not reducing
@@ -822,22 +810,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
         sldu_transactions_cnt_d = $clog2(NrLanes) + 1;
 
         issue_cnt_d = vfu_operation_i.vl;
-        if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]}))
-          issue_cnt_d = vfu_operation_i.vl;
-        else begin
-          issue_cnt_d = (vfu_operation_i.vl / 8) >>
-            vfu_operation_i.vtype.vsew;
-          issue_cnt_d += |vfu_operation_i.vl[2:0];
-        end
       end
       if (vinsn_queue_d.commit_cnt == '0)
-        if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]}))
-          commit_cnt_d = vfu_operation_i.vl;
-        else begin
-          // Operations between mask vectors operate on bits
-          commit_cnt_d  = (vfu_operation_i.vl / 8) >> vfu_operation_i.vtype.vsew;
-          commit_cnt_d += |vfu_operation_i.vl[2:0];
-        end
+        commit_cnt_d = vfu_operation_i.vl;
 
       // Bump pointers and counters of the vector instruction queue
       vinsn_queue_d.accept_pnt += 1;

diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv
@@ -1245,21 +1245,18 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
               (vinsn_processing_q.op == VMFNE) ?
                 ~vfpu_processed_result[16*b] :
                 vfpu_processed_result[16*b];
-            for (int b = 0; b < 4; b++) vfpu_processed_result[16*b+1] = vfpu_mask[2*b];
           end
           EW32: begin
             for (int b = 0; b < 2; b++) vfpu_processed_result[32*b] =
               (vinsn_processing_q.op == VMFNE) ?
                 ~vfpu_processed_result[32*b] :
                 vfpu_processed_result[32*b];
-            for (int b = 0; b < 2; b++) vfpu_processed_result[32*b+1] = vfpu_mask[4*b];
           end
           EW64: begin
             for (int b = 0; b < 1; b++) vfpu_processed_result[b] =
               (vinsn_processing_q.op == VMFNE) ?
                 ~vfpu_processed_result[b] :
                 vfpu_processed_result[b];
-            for (int b = 0; b < 1; b++) vfpu_processed_result[b+1] = vfpu_mask[8*b];
           end
         endcase
       end
@@ -2180,7 +2177,15 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
 
     if (!vinsn_queue_full && vfu_operation_valid_i &&
       (vfu_operation_i.vfu == VFU_MFpu || vfu_operation_i.op inside {[VMFEQ:VMFGE]})) begin
-      vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i;
+      vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt]    = vfu_operation_i;
+      // Masks are handled in the MASKU directly for comparisons
+      vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMFEQ:VMFGE]}
+                                                       ? 1'b1
+                                                       : vfu_operation_i.vm;
+      // During comparisons, vd_op is for the masku, not for the VMFPU
+      vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].use_vd_op = vfu_operation_i.op inside {[VMFEQ:VMFGE]}
+                                                              ? 1'b0
+                                                              : vfu_operation_i.use_vd_op;
 
       // Initialize counters
       if (vinsn_queue_d.issue_cnt == '0 && !prevent_commit) begin