[hardware] Relax tight addrgen timing paths
mp-17 committed Oct 31, 2024
1 parent 4f9d3a9 commit d04f2c8
Showing 1 changed file with 74 additions and 73 deletions.
147 changes: 74 additions & 73 deletions hardware/src/vlsu/addrgen.sv
@@ -81,7 +81,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
import axi_pkg::CACHE_MODIFIABLE;

// Check if the address is aligned to a particular width
function automatic logic is_addr_error(axi_addr_t addr, vew_e vew);
function automatic logic is_addr_error(axi_addr_t addr, logic [1:0] vew);
is_addr_error = |(addr & (elen_t'(1 << vew) - 1));
endfunction // is_addr_error
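
For reference, the predicate flags any address with a set bit below 2^vew, i.e. one not aligned to the element size. A minimal self-contained sanity sketch (not part of the repository), assuming ELEN = 64 so that vew encodes log2 of the element size in bytes:

module tb_is_addr_error;
  typedef logic [63:0] axi_addr_t;
  typedef logic [63:0] elen_t;

  // Same body as above: any set bit below 2^vew makes the address misaligned.
  function automatic logic is_addr_error(axi_addr_t addr, logic [1:0] vew);
    is_addr_error = |(addr & (elen_t'(1 << vew) - 1));
  endfunction

  initial begin
    assert (is_addr_error(64'h1000_0004, 2'd2) == 1'b0); // 4-byte aligned (SEW = 32)
    assert (is_addr_error(64'h1000_0004, 2'd3) == 1'b1); // not 8-byte aligned (SEW = 64)
  end
endmodule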

@@ -95,7 +95,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
axi_addr_t addr;
vlen_t len;
elen_t stride;
vew_e vew;
logic [1:0] vew; // Support only up to 64-bit
logic is_load;
logic is_burst; // Unit-strided instructions can be converted into AXI INCR bursts
vlen_t vstart;
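
Narrowing vew from the vew_e enum to two bits is safe under the assumption stated in the comment: with elements of at most 64 bits, vsew only takes the values 0 through 3 (SEW = 8 << vsew), and every shifter fed by vew (for example len << vew) shrinks accordingly. An illustrative sketch of the encoding this relies on (not from the source):

module vew_encoding_sketch;
  initial begin
    // vew = log2(element bytes): 0 -> 1 B, 1 -> 2 B, 2 -> 4 B, 3 -> 8 B
    for (int unsigned vew = 0; vew < 4; vew++)
      $display("vew = %0d -> SEW = %0d bits (%0d bytes)", vew, 8 << vew, 1 << vew);
  end
endmodule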
@@ -280,6 +280,32 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
end

ADDRGEN: begin
// NOTE: indexed accesses are not covered here
automatic logic [riscv::VLEN-1:0] vaddr_start;

case (pe_req_q.op)
// Unit-stride: address = base + (vstart in elements)
VLE, VSE : vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart << unsigned'(pe_req_q.vtype.vsew));
// Strided: address = base + (vstart * stride)
// NOTE: this multiplier might cause some timing issues
VLSE, VSSE: vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart * pe_req_q.stride);
// Indexed: let the next stage take care of vstart
VLXE, VSXE: vaddr_start = pe_req_q.scalar_op;
default : vaddr_start = '0;
endcase // pe_req_q.op

// Start the computation right away
addrgen_req = '{
addr : vaddr_start,
len : (pe_req_q.vl - pe_req_q.vstart) << unsigned'(pe_req_q.vtype.vsew[1:0]),
stride : pe_req_q.stride,
vew : pe_req_q.vtype.vsew[1:0],
is_load : is_load(pe_req_q.op),
// Unit-strided loads/stores trigger incremental AXI bursts.
is_burst: (pe_req_q.op inside {VLE, VSE}),
vstart : pe_req_q.vstart
};

// Ara does not support misaligned AXI requests
if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin
state_d = IDLE;
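
Note that len is now a byte count rather than an element count: the element total (vl - vstart) is converted once at the source by the vsew shift, so downstream consumers no longer need their own len << vew stages. A worked example of the new request with hypothetical values:

module addrgen_req_bytes_sketch;
  initial begin
    logic [63:0] scalar_op = 64'h8000_0000;
    logic [31:0] vl = 32'd16, vstart = 32'd3;
    logic [1:0]  vsew = 2'd2; // SEW = 32 -> 4-byte elements
    logic [63:0] addr;
    logic [31:0] len;
    addr = scalar_op + (vstart << vsew); // skip 3 elements: base + 12 bytes
    len  = (vl - vstart) << vsew;        // 13 elements -> 52 bytes
    assert (addr == 64'h8000_000C && len == 32'd52);
  end
endmodule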
@@ -289,30 +315,6 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
addrgen_exception_o.tval = '0;
end
else begin : address_valid
// NOTE: indexed accesses are not covered here
automatic logic [riscv::VLEN-1:0] vaddr_start;

case (pe_req_q.op)
// Unit-stride: address = base + (vstart in elements)
VLE, VSE : vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart << unsigned'(pe_req_q.vtype.vsew));
// Strided: address = base + (vstart * stride)
// NOTE: this multiplier might cause some timing issues
VLSE, VSSE: vaddr_start = pe_req_q.scalar_op + (pe_req_q.vstart * pe_req_q.stride);
// Indexed: let the next stage take care of vstart
VLXE, VSXE: vaddr_start = pe_req_q.scalar_op;
default : vaddr_start = '0;
endcase // pe_req_q.op

addrgen_req = '{
addr : vaddr_start,
len : pe_req_q.vl - pe_req_q.vstart,
stride : pe_req_q.stride,
vew : pe_req_q.vtype.vsew,
is_load : is_load(pe_req_q.op),
// Unit-strided loads/stores trigger incremental AXI bursts.
is_burst: (pe_req_q.op inside {VLE, VSE}),
vstart : pe_req_q.vstart
};
addrgen_req_valid = 1'b1;

if (addrgen_req_ready) begin : finished
Expand Down Expand Up @@ -343,9 +345,9 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
// Every address can generate an exception
addrgen_req = '{
addr : pe_req_q.scalar_op,
len : pe_req_q.vl - pe_req_q.vstart,
len : (pe_req_q.vl - pe_req_q.vstart) << unsigned'(pe_req_q.vtype.vsew[1:0]),
stride : pe_req_q.stride,
vew : pe_req_q.vtype.vsew,
vew : pe_req_q.vtype.vsew[1:0],
is_load : is_load(pe_req_q.op),
// Unit-strided loads/stores trigger incremental AXI bursts.
is_burst: 1'b0,
Expand Down Expand Up @@ -567,7 +569,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(

function automatic void set_end_addr (
input logic [($bits(axi_addr_t) - 12)-1:0] next_2page_msb,
input int unsigned num_bytes,
input vlen_t num_bytes,
input axi_addr_t addr,
input logic [clog2_AxiStrobeWidth:0] eff_axi_dw,
input logic [idx_width(clog2_AxiStrobeWidth):0] eff_axi_dw_log,
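
The retyping of num_bytes to vlen_t matches the byte-based len this function now receives. For context, set_end_addr bounds each burst so it never crosses into the page after next (hence the next_2page_msb argument), starting from an address aligned down to the interface width by aligned_addr. A hedged sketch of that alignment helper, assuming it simply clears the low bits (the real implementation lives elsewhere in addrgen.sv):

function automatic logic [63:0] aligned_addr(logic [63:0] addr, int unsigned log2_width);
  aligned_addr = (addr >> log2_width) << log2_width; // clear the low log2_width bits
endfunction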
@@ -641,8 +643,30 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
// Clear exception buffer
mmu_exception_d = '0;

// This computation is timing-critical. Look ahead and compute it even if the address is not valid yet.
axi_addrgen_d = addrgen_req;

// The start address is found by aligning the original request address by the width of
// the memory interface.
aligned_start_addr_d = aligned_addr(axi_addrgen_d.addr, clog2_AxiStrobeWidth);
// Pre-calculate the next_2page_msb. This should not require much energy if the addr
// has zeroes in the upper positions.
// We can use this also for the misaligned address calculation, as the next 2 page msb
// will be the same either way.
next_2page_msb_d = aligned_start_addr_d[AxiAddrWidth-1:12] + 1;
// The final address can be found similarly...
set_end_addr (
next_2page_msb_d,
axi_addrgen_d.len,
axi_addrgen_d.addr,
AxiDataWidth/8,
clog2_AxiStrobeWidth,
aligned_start_addr_d,
aligned_end_addr_d,
aligned_next_start_addr_d
);

if (addrgen_req_valid) begin
axi_addrgen_d = addrgen_req;
axi_addrgen_state_d = core_st_pending_i ? AXI_ADDRGEN_WAITING_CORE_STORE_PENDING : AXI_ADDRGEN_REQUESTING;

// In case of a misaligned store, reduce the effective width of the AXI transaction,
@@ -664,26 +688,6 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
eff_axi_dw_d = AxiDataWidth/8;
eff_axi_dw_log_d = clog2_AxiStrobeWidth;
end

// The start address is found by aligning the original request address by the width of
// the memory interface.
aligned_start_addr_d = aligned_addr(axi_addrgen_d.addr, clog2_AxiStrobeWidth);
// Pre-calculate the next_2page_msb. This should not require much energy if the addr
// has zeroes in the upper positions.
// We can use this also for the misaligned address calculation, as the next 2 page msb
// will be the same either way.
next_2page_msb_d = aligned_start_addr_d[AxiAddrWidth-1:12] + 1;
// The final address can be found similarly...
set_end_addr (
next_2page_msb_d,
(axi_addrgen_d.len << unsigned'(axi_addrgen_d.vew)),
axi_addrgen_d.addr,
AxiDataWidth/8,
clog2_AxiStrobeWidth,
aligned_start_addr_d,
aligned_end_addr_d,
aligned_next_start_addr_d
);
end
end : axi_addrgen_state_AXI_ADDRGEN_IDLE
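
The restructuring of this state is the heart of the timing fix: the aligned-address arithmetic and the first set_end_addr call now run unconditionally, ahead of the addrgen_req_valid check, so the adders are no longer chained behind the handshake decode; valid merely decides whether the precomputed values are committed. A minimal sketch of the pattern, with hypothetical names:

module lookahead_sketch (
  input  logic        clk_i,
  input  logic        req_valid_i,
  input  logic [63:0] req_addr_i,
  output logic [63:0] aligned_start_q
);
  logic [63:0] aligned_start_d;

  // Speculative datapath: computed every cycle, independent of the handshake,
  // so req_valid_i drops out of the arithmetic's timing path.
  assign aligned_start_d = (req_addr_i >> 6) << 6; // align to a 64-byte interface

  always_ff @(posedge clk_i) begin
    if (req_valid_i) aligned_start_q <= aligned_start_d; // valid only gates the commit
  end
endmodule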

@@ -696,7 +700,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(

set_end_addr (
next_2page_msb_q,
(axi_addrgen_q.len << unsigned'(axi_addrgen_q.vew)),
axi_addrgen_q.len,
axi_addrgen_q.addr,
eff_axi_dw_q,
eff_axi_dw_log_q,
@@ -714,11 +718,20 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(

AXI_ADDRGEN_REQUESTING : begin : axi_addrgen_state_AXI_ADDRGEN_REQUESTING
automatic logic axi_ax_ready = (axi_addrgen_q.is_load && axi_ar_ready_i) || (!axi_addrgen_q.is_load && axi_aw_ready_i);
automatic logic [11:0] num_bytes;
automatic vlen_t remaining_bytes;

// Pre-calculate the next_2page_msb. This should not require much energy if the addr
// has zeroes in the upper positions.
next_2page_msb_d = aligned_next_start_addr_q[AxiAddrWidth-1:12] + 1;

// Pre-calculate the bytes used in a unit-strided access and the remaining bytes to request
num_bytes = aligned_next_start_addr_q[11:0] - axi_addrgen_q.addr[11:0];
remaining_bytes = axi_addrgen_q.len - num_bytes;
if (axi_addrgen_q.len < num_bytes) begin
remaining_bytes = 0;
end
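
num_bytes and remaining_bytes are likewise hoisted out of the grant branch and evaluated every cycle, with an explicit clamp to zero replacing a conditional subtraction deeper in the burst logic. A worked example with hypothetical values:

module remaining_bytes_sketch;
  initial begin
    logic [11:0] addr_lsb   = 12'h0F4; // axi_addrgen_q.addr[11:0]
    logic [11:0] next_start = 12'h100; // aligned_next_start_addr_q[11:0]
    logic [31:0] len        = 32'd52;  // bytes still owed by the request
    logic [11:0] num_bytes;
    logic [31:0] remaining_bytes;
    num_bytes       = next_start - addr_lsb;                    // 12 bytes in this burst
    remaining_bytes = (len < num_bytes) ? '0 : len - num_bytes; // saturate at zero
    assert (num_bytes == 12'd12 && remaining_bytes == 32'd40);
  end
endmodule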

// Before starting a transaction on a different channel, wait for the previous ones to
// complete. Otherwise, the ordering of the responses is not guaranteed, and with the
// current implementation we can incur deadlocks
Expand Down Expand Up @@ -748,9 +761,6 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(

// NOTE: all these variables could be narrowed to the minimum number of bits
automatic int unsigned num_beats;
automatic int unsigned num_bytes;
automatic int unsigned burst_len_bytes;
automatic int unsigned axi_addrgen_bytes;

// AXI burst length
automatic int unsigned burst_length;
Expand Down Expand Up @@ -802,32 +812,21 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
};
axi_addrgen_queue_push = 1'b1;

// Account for the requested operands
num_bytes = ((aligned_end_addr_q[11:0] - axi_addrgen_q.addr[11:0] + 1) >> unsigned'(axi_addrgen_q.vew));
if (axi_addrgen_q.len >= num_bytes) begin
axi_addrgen_d.len = axi_addrgen_q.len - num_bytes;
end
else begin
axi_addrgen_d.len = 0;
end
axi_addrgen_d.len = remaining_bytes;
axi_addrgen_d.addr = aligned_next_start_addr_q;

// Calculate the addresses for the next iteration
// TODO: test this for SEW!=64, otherwise this computation is never used
// The start address is found by aligning the original request address by the width of
// the memory interface. In our case, we have it already.
aligned_start_addr_d = axi_addrgen_d.addr;
// The final address can be found similarly.
// How many bytes did we request? No more than burst_len_bytes
burst_len_bytes = (256 << eff_axi_dw_log_q);
axi_addrgen_bytes = (axi_addrgen_d.len << unsigned'(axi_addrgen_q.vew));
set_end_addr (
next_2page_msb_d,
(axi_addrgen_d.len << unsigned'(axi_addrgen_d.vew)),
aligned_start_addr_d,
axi_addrgen_q.len - num_bytes,
aligned_next_start_addr_q,
eff_axi_dw_q,
eff_axi_dw_log_q,
aligned_start_addr_d,
aligned_next_start_addr_q,
aligned_end_addr_d,
aligned_next_start_addr_d
);
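
Here set_end_addr is re-invoked to prepare the next burst's boundaries one iteration in advance. Each burst is capped both by the AXI4 limit of 256 beats (the 256 << eff_axi_dw_log_q term above) and by the 4 KiB page boundary; a hedged sketch of that cap, assuming INCR bursts:

// Illustrative upper bound on the bytes one AXI4 INCR burst may cover from addr.
function automatic int unsigned max_burst_bytes(
  input logic [11:0] addr_lsb,       // addr[11:0]
  input int unsigned eff_axi_dw_log  // log2(bytes per beat)
);
  int unsigned to_page = 13'd4096 - addr_lsb;   // bytes left in the 4 KiB page
  int unsigned cap     = 256 << eff_axi_dw_log; // 256-beat AXI4 burst limit
  max_burst_bytes = (to_page < cap) ? to_page : cap;
endfunction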
Expand Down Expand Up @@ -872,7 +871,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
axi_addrgen_queue_push = 1'b1;

// Account for the requested operands
axi_addrgen_d.len = axi_addrgen_q.len - 1;
// This should never overflow
axi_addrgen_d.len = axi_addrgen_q.len - (1 << axi_addrgen_q.vew);
// Calculate the addresses for the next iteration, adding the correct stride
axi_addrgen_d.addr = axi_addrgen_q.addr + axi_addrgen_q.stride;
end : strided
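
With len counted in bytes, the strided path now retires (1 << vew) bytes, i.e. exactly one element, per AXI transaction instead of decrementing an element counter. For example, with hypothetical values:

module strided_step_sketch;
  initial begin
    logic [31:0] len    = 32'd52;
    logic [1:0]  vew    = 2'd2; // SEW = 32 -> 4 bytes per element
    logic [63:0] addr   = 64'h8000_0000;
    logic [63:0] stride = 64'd256;
    len  = len - (32'd1 << vew); // one element consumed: 52 -> 48 bytes left
    addr = addr + stride;        // next strided address
    assert (len == 32'd48 && addr == 64'h8000_0100);
  end
endmodule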
@@ -894,7 +894,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
// Generate an error
idx_op_error_d = 1'b1;
// Forward next vstart info to the dispatcher
addrgen_exception_vstart_d = addrgen_req.len - axi_addrgen_q.len - 1;
addrgen_exception_vstart_d = ((addrgen_req.len - axi_addrgen_q.len) >> axi_addrgen_q.vew) - 1;
addrgen_req_ready = 1'b1;
axi_addrgen_state_d = AXI_ADDRGEN_IDLE;
end : eew_misaligned_error
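
Since both lengths are byte counts, the faulting element index is recovered by shifting the consumed bytes back down by vew. The parentheses around the shift are load-bearing: in SystemVerilog, binary minus binds tighter than >>, so without them the expression would shift by vew - 1 instead. A worked example:

module idx_vstart_sketch;
  initial begin
    logic [31:0] req_len_bytes = 32'd52; // addrgen_req.len
    logic [31:0] left_bytes    = 32'd20; // axi_addrgen_q.len
    logic [1:0]  vew           = 2'd2;   // 4-byte elements
    logic [31:0] vstart;
    vstart = ((req_len_bytes - left_bytes) >> vew) - 1; // (32 B >> 2) - 1 = element 7
    assert (vstart == 32'd7);
  end
endmodule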
Expand Down Expand Up @@ -941,7 +941,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
axi_addrgen_queue_push = 1'b1;

// Account for the requested operands
axi_addrgen_d.len = axi_addrgen_q.len - 1;
// This should never overflow
axi_addrgen_d.len = axi_addrgen_q.len - (1 << axi_addrgen_q.vew);
end : aligned_vaddress
end : if_idx_vaddr_valid_q
end : indexed
Expand Down Expand Up @@ -975,7 +976,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
// Set vstart: vl minus how many elements we have left
// NOTE: this added complexity only comes from the fact that the beat counting
// implementation counts down from the expected length, instead of up from zero
addrgen_exception_vstart_d = pe_req_q.vl - axi_addrgen_q.len;
addrgen_exception_vstart_d = pe_req_q.vl - (axi_addrgen_q.len >> axi_addrgen_q.vew);

// Mute new requests
mmu_req_o = 1'b0;
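
The same byte-to-element conversion closes out the MMU exception path: the elements still outstanding are axi_addrgen_q.len >> vew, and vstart is vl minus that count. With hypothetical values:

module mmu_vstart_sketch;
  initial begin
    logic [31:0] vl        = 32'd16;
    logic [31:0] len_bytes = 32'd20; // bytes still to be translated
    logic [1:0]  vew       = 2'd2;   // 4-byte elements
    logic [31:0] vstart;
    vstart = vl - (len_bytes >> vew); // 16 - 5 = 11 elements already completed
    assert (vstart == 32'd11);
  end
endmodule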