[hardware] Relax tight addrgen timing paths
mp-17 committed Oct 31, 2024
1 parent 4f9d3a9 commit d04f2c8
Showing 1 changed file with 74 additions and 73 deletions.
147 changes: 74 additions & 73 deletions hardware/src/vlsu/addrgen.sv
@@ -81,7 +81,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
import axi_pkg::CACHE_MODIFIABLE;

// Check if the address is aligned to a particular width
function automatic logic is_addr_error(axi_addr_t addr, vew_e vew);
function automatic logic is_addr_error(axi_addr_t addr, logic [1:0] vew);
is_addr_error = |(addr & (elen_t'(1 << vew) - 1));
endfunction // is_addr_error
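
For reference, the predicate flags any address with a set bit below 2^vew, i.e. one not aligned to the element size. A minimal self-contained sanity sketch (not part of the repository), assuming ELEN = 64 so that vew encodes log2 of the element size in bytes:

module tb_is_addr_error;
  typedef logic [63:0] axi_addr_t;
  typedef logic [63:0] elen_t;

  // Same body as above: any set bit below 2^vew makes the address misaligned.
  function automatic logic is_addr_error(axi_addr_t addr, logic [1:0] vew);
    is_addr_error = |(addr & (elen_t'(1 << vew) - 1));
  endfunction

  initial begin
    assert (is_addr_error(64'h1000_0004, 2'd2) == 1'b0); // 4-byte aligned (SEW = 32)
    assert (is_addr_error(64'h1000_0004, 2'd3) == 1'b1); // not 8-byte aligned (SEW = 64)
  end
endmodule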

@@ -95,7 +95,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
axi_addr_t addr;
vlen_t len;
elen_t stride;
vew_e vew;
logic [1:0] vew; // Support only up to 64-bit
logic is_load;
logic is_burst; // Unit-strided instructions can be converted into AXI INCR bursts
vlen_t vstart;
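
Narrowing vew from the vew_e enum to two bits is safe under the assumption stated in the comment: with elements of at most 64 bits, vsew only takes the values 0 through 3 (SEW = 8 << vsew), and every shifter fed by vew (for example len << vew) shrinks accordingly. An illustrative sketch of the encoding this relies on (not from the source):

module vew_encoding_sketch;
  initial begin
    // vew = log2(element bytes): 0 -> 1 B, 1 -> 2 B, 2 -> 4 B, 3 -> 8 B
    for (int unsigned vew = 0; vew < 4; vew++)
      $display("vew = %0d -> SEW = %0d bits (%0d bytes)", vew, 8 << vew, 1 << vew);
  end
endmodule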
@@ -280,6 +280,32 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
end

ADDRGEN: begin
// NOTE: indexed accesses are not covered here
automatic logic [riscv::VLEN-1:0] vaddr_start;

case (pe_req_q.op)
// Unit-stride: address = base + (vstart in elements)
VLE, VSE : vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart << unsigned'(pe_req_q.vtype.vsew));
// Strided: address = base + (vstart * stride)
// NOTE: this multiplier might cause some timing issues
VLSE, VSSE: vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart * pe_req_q.stride);
// Indexed: let the next stage take care of vstart
VLXE, VSXE: vaddr_start = pe_req_q.scalar_op;
default : vaddr_start = '0;
endcase // pe_req_q.op

// Start the computation right away
addrgen_req = '{
addr : vaddr_start,
len : (pe_req_q.vl - pe_req_q.vstart) << unsigned'(pe_req_q.vtype.vsew[1:0]),
stride : pe_req_q.stride,
vew : pe_req_q.vtype.vsew[1:0],
is_load : is_load(pe_req_q.op),
// Unit-strided loads/stores trigger incremental AXI bursts.
is_burst: (pe_req_q.op inside {VLE, VSE}),
vstart : pe_req_q.vstart
};

// Ara does not support misaligned AXI requests
if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin
state_d = IDLE;
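
Note that len is now a byte count rather than an element count: the element total (vl - vstart) is converted once at the source by the vsew shift, so downstream consumers no longer need their own len << vew stages. A worked example of the new request with hypothetical values:

module addrgen_req_bytes_sketch;
  initial begin
    logic [63:0] scalar_op = 64'h8000_0000;
    logic [31:0] vl = 32'd16, vstart = 32'd3;
    logic [1:0]  vsew = 2'd2; // SEW = 32 -> 4-byte elements
    logic [63:0] addr;
    logic [31:0] len;
    addr = scalar_op + (vstart << vsew); // skip 3 elements: base + 12 bytes
    len  = (vl - vstart) << vsew;        // 13 elements -> 52 bytes
    assert (addr == 64'h8000_000C && len == 32'd52);
  end
endmodule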
@@ -289,30 +315,6 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
addrgen_exception_o.tval = '0;
end
else begin : address_valid
// NOTE: indexed accesses are not covered here
automatic logic [riscv::VLEN-1:0] vaddr_start;

case (pe_req_q.op)
// Unit-stride: address = base + (vstart in elements)
VLE, VSE : vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart << unsigned'(pe_req_q.vtype.vsew));
// Strided: address = base + (vstart * stride)
// NOTE: this multiplier might cause some timing issues
VLSE, VSSE: vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart * pe_req_q.stride);
// Indexed: let the next stage take care of vstart
VLXE, VSXE: vaddr_start = pe_req_q.scalar_op;
default : vaddr_start = '0;
endcase // pe_req_q.op

addrgen_req = '{
addr : vaddr_start,
len : pe_req_q.vl - pe_req_q.vstart,
stride : pe_req_q.stride,
vew : pe_req_q.vtype.vsew,
is_load : is_load(pe_req_q.op),
// Unit-strided loads/stores trigger incremental AXI bursts.
is_burst: (pe_req_q.op inside {VLE, VSE}),
vstart : pe_req_q.vstart
};
addrgen_req_valid = 1'b1;

if (addrgen_req_ready) begin : finished
Expand Down Expand Up @@ -343,9 +345,9 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
// Every address can generate an exception
addrgen_req = '{
addr : pe_req_q.scalar_op,
len : pe_req_q.vl - pe_req_q.vstart,
len : (pe_req_q.vl - pe_req_q.vstart) << unsigned'(pe_req_q.vtype.vsew[1:0]),
stride : pe_req_q.stride,
vew : pe_req_q.vtype.vsew,
vew : pe_req_q.vtype.vsew[1:0],
is_load : is_load(pe_req_q.op),
// Unit-strided loads/stores trigger incremental AXI bursts.
is_burst: 1'b0,
Expand Down Expand Up @@ -567,7 +569,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(

function automatic void set_end_addr (
input logic [($bits(axi_addr_t) - 12)-1:0] next_2page_msb,
input int unsigned num_bytes,
input vlen_t num_bytes,
input axi_addr_t addr,
input logic [clog2_AxiStrobeWidth:0] eff_axi_dw,
input logic [idx_width(clog2_AxiStrobeWidth):0] eff_axi_dw_log,
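
The retyping of num_bytes to vlen_t matches the byte-based len this function now receives. For context, set_end_addr bounds each burst so it never crosses into the page after next (hence the next_2page_msb argument), starting from an address aligned down to the interface width by aligned_addr. A hedged sketch of that alignment helper, assuming it simply clears the low bits (the real implementation lives elsewhere in addrgen.sv):

function automatic logic [63:0] aligned_addr(logic [63:0] addr, int unsigned log2_width);
  aligned_addr = (addr >> log2_width) << log2_width; // clear the low log2_width bits
endfunction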
@@ -641,8 +643,30 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
// Clear exception buffer
mmu_exception_d = '0;

// This computation is timing-critical. Look ahead and compute it even if the address is not valid yet.
axi_addrgen_d = addrgen_req;

// The start address is found by aligning the original request address by the width of
// the memory interface.
aligned_start_addr_d = aligned_addr(axi_addrgen_d.addr, clog2_AxiStrobeWidth);
// Pre-calculate the next_2page_msb. This should not require much energy if the addr
// has zeroes in the upper positions.
// We can use this also for the misaligned address calculation, as the next 2 page msb
// will be the same either way.
next_2page_msb_d = aligned_start_addr_d[AxiAddrWidth-1:12] + 1;
// The final address can be found similarly...
set_end_addr (
next_2page_msb_d,
axi_addrgen_d.len,
axi_addrgen_d.addr,
AxiDataWidth/8,
clog2_AxiStrobeWidth,
aligned_start_addr_d,
aligned_end_addr_d,
aligned_next_start_addr_d
);

if (addrgen_req_valid) begin
axi_addrgen_d = addrgen_req;
axi_addrgen_state_d = core_st_pending_i ? AXI_ADDRGEN_WAITING_CORE_STORE_PENDING : AXI_ADDRGEN_REQUESTING;

// In case of a misaligned store, reduce the effective width of the AXI transaction,
@@ -664,26 +688,6 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
eff_axi_dw_d = AxiDataWidth/8;
eff_axi_dw_log_d = clog2_AxiStrobeWidth;
end

// The start address is found by aligning the original request address by the width of
// the memory interface.
aligned_start_addr_d = aligned_addr(axi_addrgen_d.addr, clog2_AxiStrobeWidth);
// Pre-calculate the next_2page_msb. This should not require much energy if the addr
// has zeroes in the upper positions.
// We can use this also for the misaligned address calculation, as the next 2 page msb
// will be the same either way.
next_2page_msb_d = aligned_start_addr_d[AxiAddrWidth-1:12] + 1;
// The final address can be found similarly...
set_end_addr (
next_2page_msb_d,
(axi_addrgen_d.len << unsigned'(axi_addrgen_d.vew)),
axi_addrgen_d.addr,
AxiDataWidth/8,
clog2_AxiStrobeWidth,
aligned_start_addr_d,
aligned_end_addr_d,
aligned_next_start_addr_d
);
end
end : axi_addrgen_state_AXI_ADDRGEN_IDLE
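
The restructuring of this state is the heart of the timing fix: the aligned-address arithmetic and the first set_end_addr call now run unconditionally, ahead of the addrgen_req_valid check, so the adders are no longer chained behind the handshake decode; valid merely decides whether the precomputed values are committed. A minimal sketch of the pattern, with hypothetical names:

module lookahead_sketch (
  input  logic        clk_i,
  input  logic        req_valid_i,
  input  logic [63:0] req_addr_i,
  output logic [63:0] aligned_start_q
);
  logic [63:0] aligned_start_d;

  // Speculative datapath: computed every cycle, independent of the handshake,
  // so req_valid_i drops out of the arithmetic's timing path.
  assign aligned_start_d = (req_addr_i >> 6) << 6; // align to a 64-byte interface

  always_ff @(posedge clk_i) begin
    if (req_valid_i) aligned_start_q <= aligned_start_d; // valid only gates the commit
  end
endmodule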

@@ -696,7 +700,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(

set_end_addr (
next_2page_msb_q,
(axi_addrgen_q.len << unsigned'(axi_addrgen_q.vew)),
axi_addrgen_q.len,
axi_addrgen_q.addr,
eff_axi_dw_q,
eff_axi_dw_log_q,
@@ -714,11 +718,20 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(

AXI_ADDRGEN_REQUESTING : begin : axi_addrgen_state_AXI_ADDRGEN_REQUESTING
automatic logic axi_ax_ready = (axi_addrgen_q.is_load && axi_ar_ready_i) || (!axi_addrgen_q.is_load && axi_aw_ready_i);
automatic logic [11:0] num_bytes;
automatic vlen_t remaining_bytes;

// Pre-calculate the next_2page_msb. This should not require much energy if the addr
// has zeroes in the upper positions.
next_2page_msb_d = aligned_next_start_addr_q[AxiAddrWidth-1:12] + 1;

// Pre-calculate the bytes used in a unit-strided access and the remaining bytes to request
num_bytes = aligned_next_start_addr_q[11:0] - axi_addrgen_q.addr[11:0];
remaining_bytes = axi_addrgen_q.len - num_bytes;
if (axi_addrgen_q.len < num_bytes) begin
remaining_bytes = 0;
end
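
num_bytes and remaining_bytes are likewise hoisted out of the grant branch and evaluated every cycle, with an explicit clamp to zero replacing a conditional subtraction deeper in the burst logic. A worked example with hypothetical values:

module remaining_bytes_sketch;
  initial begin
    logic [11:0] addr_lsb   = 12'h0F4; // axi_addrgen_q.addr[11:0]
    logic [11:0] next_start = 12'h100; // aligned_next_start_addr_q[11:0]
    logic [31:0] len        = 32'd52;  // bytes still owed by the request
    logic [11:0] num_bytes;
    logic [31:0] remaining_bytes;
    num_bytes       = next_start - addr_lsb;                    // 12 bytes in this burst
    remaining_bytes = (len < num_bytes) ? '0 : len - num_bytes; // saturate at zero
    assert (num_bytes == 12'd12 && remaining_bytes == 32'd40);
  end
endmodule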

// Before starting a transaction on a different channel, wait for the previous ones to
// complete. Otherwise, the ordering of the responses is not guaranteed, and with the
// current implementation we can incur deadlocks
Expand Down Expand Up @@ -748,9 +761,6 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(

// NOTE: all these variables could be narrowed to the minimum number of bits
automatic int unsigned num_beats;
automatic int unsigned num_bytes;
automatic int unsigned burst_len_bytes;
automatic int unsigned axi_addrgen_bytes;

// AXI burst length
automatic int unsigned burst_length;
Expand Down Expand Up @@ -802,32 +812,21 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
};
axi_addrgen_queue_push = 1'b1;

// Account for the requested operands
num_bytes = ((aligned_end_addr_q[11:0] - axi_addrgen_q.addr[11:0] + 1) >> unsigned'(axi_addrgen_q.vew));
if (axi_addrgen_q.len >= num_bytes) begin
axi_addrgen_d.len = axi_addrgen_q.len - num_bytes;
end
else begin
axi_addrgen_d.len = 0;
end
axi_addrgen_d.len = remaining_bytes;
axi_addrgen_d.addr = aligned_next_start_addr_q;

// Calculate the addresses for the next iteration
// TODO: test this for SEW!=64, otherwise this computation is never used
// The start address is found by aligning the original request address by the width of
// the memory interface. In our case, we have it already.
aligned_start_addr_d = axi_addrgen_d.addr;
// The final address can be found similarly.
// How many bytes did we request? No more than burst_len_bytes
burst_len_bytes = (256 << eff_axi_dw_log_q);
axi_addrgen_bytes = (axi_addrgen_d.len << unsigned'(axi_addrgen_q.vew));
set_end_addr (
next_2page_msb_d,
(axi_addrgen_d.len << unsigned'(axi_addrgen_d.vew)),
aligned_start_addr_d,
axi_addrgen_q.len - num_bytes,
aligned_next_start_addr_q,
eff_axi_dw_q,
eff_axi_dw_log_q,
aligned_start_addr_d,
aligned_next_start_addr_q,
aligned_end_addr_d,
aligned_next_start_addr_d
);
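
Here set_end_addr is re-invoked to prepare the next burst's boundaries one iteration in advance. Each burst is capped both by the AXI4 limit of 256 beats (the 256 << eff_axi_dw_log_q term above) and by the 4 KiB page boundary; a hedged sketch of that cap, assuming INCR bursts:

// Illustrative upper bound on the bytes one AXI4 INCR burst may cover from addr.
function automatic int unsigned max_burst_bytes(
  input logic [11:0] addr_lsb,       // addr[11:0]
  input int unsigned eff_axi_dw_log  // log2(bytes per beat)
);
  int unsigned to_page = 13'd4096 - addr_lsb;   // bytes left in the 4 KiB page
  int unsigned cap     = 256 << eff_axi_dw_log; // 256-beat AXI4 burst limit
  max_burst_bytes = (to_page < cap) ? to_page : cap;
endfunction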
Expand Down Expand Up @@ -872,7 +871,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
axi_addrgen_queue_push = 1'b1;

// Account for the requested operands
axi_addrgen_d.len = axi_addrgen_q.len - 1;
// This should never overflow
axi_addrgen_d.len = axi_addrgen_q.len - (1 << axi_addrgen_q.vew);
// Calculate the addresses for the next iteration, adding the correct stride
axi_addrgen_d.addr = axi_addrgen_q.addr + axi_addrgen_q.stride;
end : strided
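
With len counted in bytes, the strided path now retires (1 << vew) bytes, i.e. exactly one element, per AXI transaction instead of decrementing an element counter. For example, with hypothetical values:

module strided_step_sketch;
  initial begin
    logic [31:0] len    = 32'd52;
    logic [1:0]  vew    = 2'd2; // SEW = 32 -> 4 bytes per element
    logic [63:0] addr   = 64'h8000_0000;
    logic [63:0] stride = 64'd256;
    len  = len - (32'd1 << vew); // one element consumed: 52 -> 48 bytes left
    addr = addr + stride;        // next strided address
    assert (len == 32'd48 && addr == 64'h8000_0100);
  end
endmodule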
@@ -894,7 +894,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
// Generate an error
idx_op_error_d = 1'b1;
// Forward next vstart info to the dispatcher
addrgen_exception_vstart_d = addrgen_req.len - axi_addrgen_q.len - 1;
addrgen_exception_vstart_d = ((addrgen_req.len - axi_addrgen_q.len) >> axi_addrgen_q.vew) - 1;
addrgen_req_ready = 1'b1;
axi_addrgen_state_d = AXI_ADDRGEN_IDLE;
end : eew_misaligned_error
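
Since both lengths are byte counts, the faulting element index is recovered by shifting the consumed bytes back down by vew. The parentheses around the shift are load-bearing: in SystemVerilog, binary minus binds tighter than >>, so without them the expression would shift by vew - 1 instead. A worked example:

module idx_vstart_sketch;
  initial begin
    logic [31:0] req_len_bytes = 32'd52; // addrgen_req.len
    logic [31:0] left_bytes    = 32'd20; // axi_addrgen_q.len
    logic [1:0]  vew           = 2'd2;   // 4-byte elements
    logic [31:0] vstart;
    vstart = ((req_len_bytes - left_bytes) >> vew) - 1; // (32 B >> 2) - 1 = element 7
    assert (vstart == 32'd7);
  end
endmodule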
Expand Down Expand Up @@ -941,7 +941,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
axi_addrgen_queue_push = 1'b1;

// Account for the requested operands
axi_addrgen_d.len = axi_addrgen_q.len - 1;
// This should never overflow
axi_addrgen_d.len = axi_addrgen_q.len - (1 << axi_addrgen_q.vew);
end : aligned_vaddress
end : if_idx_vaddr_valid_q
end : indexed
Expand Down Expand Up @@ -975,7 +976,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
// Set vstart: vl minus how many elements we have left
// NOTE: this added complexity only comes from the fact that the beat counting
// implementation counts down from the expected length, instead of up from zero
addrgen_exception_vstart_d = pe_req_q.vl - axi_addrgen_q.len;
addrgen_exception_vstart_d = pe_req_q.vl - (axi_addrgen_q.len >> axi_addrgen_q.vew);

// Mute new requests
mmu_req_o = 1'b0;
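
The same byte-to-element conversion closes out the MMU exception path: the elements still outstanding are axi_addrgen_q.len >> vew, and vstart is vl minus that count. With hypothetical values:

module mmu_vstart_sketch;
  initial begin
    logic [31:0] vl        = 32'd16;
    logic [31:0] len_bytes = 32'd20; // bytes still to be translated
    logic [1:0]  vew       = 2'd2;   // 4-byte elements
    logic [31:0] vstart;
    vstart = vl - (len_bytes >> vew); // 16 - 5 = 11 elements already completed
    assert (vstart == 32'd11);
  end
endmodule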