diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index abc628b7f..4d744c1a4 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -81,7 +81,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( import axi_pkg::CACHE_MODIFIABLE; // Check if the address is aligned to a particular width - function automatic logic is_addr_error(axi_addr_t addr, vew_e vew); + function automatic logic is_addr_error(axi_addr_t addr, logic [1:0] vew); is_addr_error = |(addr & (elen_t'(1 << vew) - 1)); endfunction // is_addr_error @@ -95,7 +95,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_addr_t addr; vlen_t len; elen_t stride; - vew_e vew; + logic [1:0] vew; // Support only up to 64-bit logic is_load; logic is_burst; // Unit-strided instructions can be converted into AXI INCR bursts vlen_t vstart; @@ -280,6 +280,32 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( end ADDRGEN: begin + // NOTE: indexed are not covered here + automatic logic [riscv::VLEN-1:0] vaddr_start; + + case (pe_req_q.op) + // Unit-stride: address = base + (vstart in elements) + VLE, VSE : vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart << unsigned'(pe_req_q.vtype.vsew)); + // Strided: address = base + (vstart * stride) + // NOTE: this multiplier might cause some timing issues + VLSE, VSSE: vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart * pe_req_q.stride); + // Indexed: let the next stage take care of vstart + VLXE, VSXE: vaddr_start = pe_req_q.scalar_op; + default : vaddr_start = '0; + endcase // pe_req_q.op + + // Start the computation already + addrgen_req = '{ + addr : vaddr_start, + len : (pe_req_q.vl - pe_req_q.vstart) << unsigned'(pe_req_q.vtype.vsew[1:0]), + stride : pe_req_q.stride, + vew : pe_req_q.vtype.vsew[1:0], + is_load : is_load(pe_req_q.op), + // Unit-strided loads/stores trigger incremental AXI bursts. + is_burst: (pe_req_q.op inside {VLE, VSE}), + vstart : pe_req_q.vstart + }; + // Ara does not support misaligned AXI requests if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin state_d = IDLE; @@ -289,30 +315,6 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( addrgen_exception_o.tval = '0; end else begin : address_valid - // NOTE: indexed are not covered here - automatic logic [riscv::VLEN-1:0] vaddr_start; - - case (pe_req_q.op) - // Unit-stride: address = base + (vstart in elements) - VLE, VSE : vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart << unsigned'(pe_req_q.vtype.vsew)); - // Strided: address = base + (vstart * stride) - // NOTE: this multiplier might cause some timing issues - VLSE, VSSE: vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart * pe_req_q.stride); - // Indexed: let the next stage take care of vstart - VLXE, VSXE: vaddr_start = pe_req_q.scalar_op; - default : vaddr_start = '0; - endcase // pe_req_q.op - - addrgen_req = '{ - addr : vaddr_start, - len : pe_req_q.vl - pe_req_q.vstart, - stride : pe_req_q.stride, - vew : pe_req_q.vtype.vsew, - is_load : is_load(pe_req_q.op), - // Unit-strided loads/stores trigger incremental AXI bursts. - is_burst: (pe_req_q.op inside {VLE, VSE}), - vstart : pe_req_q.vstart - }; addrgen_req_valid = 1'b1; if (addrgen_req_ready) begin : finished @@ -343,9 +345,9 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Every address can generate an exception addrgen_req = '{ addr : pe_req_q.scalar_op, - len : pe_req_q.vl - pe_req_q.vstart, + len : (pe_req_q.vl - pe_req_q.vstart) << unsigned'(pe_req_q.vtype.vsew[1:0]), stride : pe_req_q.stride, - vew : pe_req_q.vtype.vsew, + vew : pe_req_q.vtype.vsew[1:0], is_load : is_load(pe_req_q.op), // Unit-strided loads/stores trigger incremental AXI bursts. is_burst: 1'b0, @@ -567,7 +569,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( function automatic void set_end_addr ( input logic [($bits(axi_addr_t) - 12)-1:0] next_2page_msb, - input int unsigned num_bytes, + input vlen_t num_bytes, input axi_addr_t addr, input logic [clog2_AxiStrobeWidth:0] eff_axi_dw, input logic [idx_width(clog2_AxiStrobeWidth):0] eff_axi_dw_log, @@ -641,8 +643,30 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Clear exception buffer mmu_exception_d = '0; + // This computation is timing-critical. Look ahead and compute even if addr not valid. + axi_addrgen_d = addrgen_req; + + // The start address is found by aligning the original request address by the width of + // the memory interface. + aligned_start_addr_d = aligned_addr(axi_addrgen_d.addr, clog2_AxiStrobeWidth); + // Pre-calculate the next_2page_msb. This should not require much energy if the addr + // has zeroes in the upper positions. + // We can use this also for the misaligned address calculation, as the next 2 page msb + // will be the same either way. + next_2page_msb_d = aligned_start_addr_d[AxiAddrWidth-1:12] + 1; + // The final address can be found similarly... + set_end_addr ( + next_2page_msb_d, + axi_addrgen_d.len, + axi_addrgen_d.addr, + AxiDataWidth/8, + clog2_AxiStrobeWidth, + aligned_start_addr_d, + aligned_end_addr_d, + aligned_next_start_addr_d + ); + if (addrgen_req_valid) begin - axi_addrgen_d = addrgen_req; axi_addrgen_state_d = core_st_pending_i ? AXI_ADDRGEN_WAITING_CORE_STORE_PENDING : AXI_ADDRGEN_REQUESTING; // In case of a misaligned store, reduce the effective width of the AXI transaction, @@ -664,26 +688,6 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( eff_axi_dw_d = AxiDataWidth/8; eff_axi_dw_log_d = clog2_AxiStrobeWidth; end - - // The start address is found by aligning the original request address by the width of - // the memory interface. - aligned_start_addr_d = aligned_addr(axi_addrgen_d.addr, clog2_AxiStrobeWidth); - // Pre-calculate the next_2page_msb. This should not require much energy if the addr - // has zeroes in the upper positions. - // We can use this also for the misaligned address calculation, as the next 2 page msb - // will be the same either way. - next_2page_msb_d = aligned_start_addr_d[AxiAddrWidth-1:12] + 1; - // The final address can be found similarly... - set_end_addr ( - next_2page_msb_d, - (axi_addrgen_d.len << unsigned'(axi_addrgen_d.vew)), - axi_addrgen_d.addr, - AxiDataWidth/8, - clog2_AxiStrobeWidth, - aligned_start_addr_d, - aligned_end_addr_d, - aligned_next_start_addr_d - ); end end : axi_addrgen_state_AXI_ADDRGEN_IDLE @@ -696,7 +700,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( set_end_addr ( next_2page_msb_q, - (axi_addrgen_q.len << unsigned'(axi_addrgen_q.vew)), + axi_addrgen_q.len, axi_addrgen_q.addr, eff_axi_dw_q, eff_axi_dw_log_q, @@ -714,11 +718,20 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( AXI_ADDRGEN_REQUESTING : begin : axi_addrgen_state_AXI_ADDRGEN_REQUESTING automatic logic axi_ax_ready = (axi_addrgen_q.is_load && axi_ar_ready_i) || (!axi_addrgen_q.is_load && axi_aw_ready_i); + automatic logic [11:0] num_bytes; + automatic vlen_t remaining_bytes; // Pre-calculate the next_2page_msb. This should not require much energy if the addr // has zeroes in the upper positions. next_2page_msb_d = aligned_next_start_addr_q[AxiAddrWidth-1:12] + 1; + // Pre-calculate the bytes used in a unit-strided access and the remaining bytes to ask + num_bytes = aligned_next_start_addr_q[11:0] - axi_addrgen_q.addr[11:0]; + remaining_bytes = axi_addrgen_q.len - num_bytes; + if (axi_addrgen_q.len < num_bytes) begin + remaining_bytes = 0; + end + // Before starting a transaction on a different channel, wait the formers to complete // Otherwise, the ordering of the responses is not guaranteed, and with the current // implementation we can incur in deadlocks @@ -748,9 +761,6 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // NOTE: all these variables could be narrowed to the minimum number of bits automatic int unsigned num_beats; - automatic int unsigned num_bytes; - automatic int unsigned burst_len_bytes; - automatic int unsigned axi_addrgen_bytes; // AXI burst length automatic int unsigned burst_length; @@ -802,14 +812,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( }; axi_addrgen_queue_push = 1'b1; - // Account for the requested operands - num_bytes = ((aligned_end_addr_q[11:0] - axi_addrgen_q.addr[11:0] + 1) >> unsigned'(axi_addrgen_q.vew)); - if (axi_addrgen_q.len >= num_bytes) begin - axi_addrgen_d.len = axi_addrgen_q.len - num_bytes; - end - else begin - axi_addrgen_d.len = 0; - end + axi_addrgen_d.len = remaining_bytes; axi_addrgen_d.addr = aligned_next_start_addr_q; // Calculate the addresses for the next iteration @@ -817,17 +820,13 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // The start address is found by aligning the original request address by the width of // the memory interface. In our case, we have it already. aligned_start_addr_d = axi_addrgen_d.addr; - // The final address can be found similarly. - // How many B we requested? No more than (256 << burst_len_bytes) - burst_len_bytes = (256 << eff_axi_dw_log_q); - axi_addrgen_bytes = (axi_addrgen_d.len << unsigned'(axi_addrgen_q.vew)); set_end_addr ( next_2page_msb_d, - (axi_addrgen_d.len << unsigned'(axi_addrgen_d.vew)), - aligned_start_addr_d, + axi_addrgen_q.len - num_bytes, + aligned_next_start_addr_q, eff_axi_dw_q, eff_axi_dw_log_q, - aligned_start_addr_d, + aligned_next_start_addr_q, aligned_end_addr_d, aligned_next_start_addr_d ); @@ -872,7 +871,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_addrgen_queue_push = 1'b1; // Account for the requested operands - axi_addrgen_d.len = axi_addrgen_q.len - 1; + // This should never overflow + axi_addrgen_d.len = axi_addrgen_q.len - (1 << axi_addrgen_q.vew); // Calculate the addresses for the next iteration, adding the correct stride axi_addrgen_d.addr = axi_addrgen_q.addr + axi_addrgen_q.stride; end : strided @@ -894,7 +894,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Generate an error idx_op_error_d = 1'b1; // Forward next vstart info to the dispatcher - addrgen_exception_vstart_d = addrgen_req.len - axi_addrgen_q.len - 1; + addrgen_exception_vstart_d = (addrgen_req.len - axi_addrgen_q.len) >> axi_addrgen_q.vew - 1; addrgen_req_ready = 1'b1; axi_addrgen_state_d = AXI_ADDRGEN_IDLE; end : eew_misaligned_error @@ -941,7 +941,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_addrgen_queue_push = 1'b1; // Account for the requested operands - axi_addrgen_d.len = axi_addrgen_q.len - 1; + // This should never overflow + axi_addrgen_d.len = axi_addrgen_q.len - (1 << axi_addrgen_q.vew); end : aligned_vaddress end : if_idx_vaddr_valid_q end : indexed @@ -975,7 +976,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Set vstart: vl minus how many elements we have left // NOTE: this added complexity only comes from the fact that the beat counting // implementation counts down from the expected length, instead that up from zero - addrgen_exception_vstart_d = pe_req_q.vl - axi_addrgen_q.len; + addrgen_exception_vstart_d = pe_req_q.vl - (axi_addrgen_q.len >> axi_addrgen_q.vew); // Mute new requests mmu_req_o = 1'b0;