Skip to content

Commit

Permalink
Fix HWPE MAC output streamer for double word writes (pulp-platform#20)
Browse files Browse the repository at this point in the history
* hw: Add special switcher for output streamer

* hw: Add switcher for double word alignment

* hw: Decouple input and output streams in SNAX MAC

* hw: Fix bulk changes to allow 64-bit input and output streaming of HWPE MAC

* sw: Move from 32 bit to 64 bit data

* Fix parameters and comments
  • Loading branch information
rgantonio authored and JosseVanDelm committed Dec 11, 2023
1 parent 8fa377e commit 3163dcf
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 30 deletions.
19 changes: 17 additions & 2 deletions hw/snax_hwpe_mac/src/snax_hwpe_ctrl.sv
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ module snax_hwpe_ctrl #(
// Number of bits to fill to extend to DataWidth
localparam int unsigned FillBits = DataWidth - 32;

// CSR addresses for HWPE register mappings
localparam int unsigned HwpeStreamAddrA = 32'd64;
localparam int unsigned HwpeStreamAddrB = 32'd68;
localparam int unsigned HwpeStreamAddrC = 32'd72;
localparam int unsigned HwpeStreamAddrOut = 32'd76;

//---------------------------------------------
// Registers and wires
//---------------------------------------------
Expand Down Expand Up @@ -115,10 +121,17 @@ module snax_hwpe_ctrl #(
endcase
end

// Flag writes that target one of the HWPE stream address registers
// (A, B, C or Out) so that the written address can be realigned
logic address_register;
assign address_register = ( address_in == HwpeStreamAddrA
|| address_in == HwpeStreamAddrB
|| address_in == HwpeStreamAddrC
|| address_in == HwpeStreamAddrOut );

// Byte enables are asserted only when we need to write
assign be = (is_write) ? 4'hF : 4'h0;


// States
typedef enum logic [1:0] {
WAIT,
Expand Down Expand Up @@ -214,7 +227,9 @@ module snax_hwpe_ctrl #(
periph.add <= address_in;
periph.wen <= wen;
periph.be <= be;
periph.data <= req_i.data_arga[31:0];
// If the CSR update is for an address setting, round the address down
// to a double-word (64-bit) boundary and halve it before writing it
periph.data <= (address_register) ? {req_i.data_arga[31:3],3'b000} >> 1: req_i.data_arga[31:0];
end
end
WRITE: begin
Expand Down
10 changes: 7 additions & 3 deletions hw/snax_hwpe_mac/src/snax_hwpe_to_reqrsp.sv
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import reqrsp_pkg::*;
module snax_hwpe_to_reqrsp #(
parameter int unsigned AddrWidth = 48,
parameter int unsigned DataWidth = 64,
parameter bit AlignOutputDouble = 0, // This triggers the x2 alignment of the addresses for double words
parameter type tcdm_req_t = logic, // Memory request payload type, usually write enable, write data, etc.
parameter type tcdm_rsp_t = logic // Memory response payload type, usually read data
)(
Expand Down Expand Up @@ -66,7 +67,10 @@ module snax_hwpe_to_reqrsp #(
assign fifo_hwpe_tcdm_data_in.valid = hwpe_tcdm_slave.gnt & hwpe_tcdm_slave.req;

// Unpack
assign unpack_addr = fifo_hwpe_tcdm_data_out.add;
// When AlignOutputDouble is set, align the address to a double word.
// The incoming HWPE addresses are in multiples of 4;
// shifting left by one (multiplying by 2) makes them multiples of 8.
assign unpack_addr = (AlignOutputDouble) ? fifo_hwpe_tcdm_data_out.add << 1 : fifo_hwpe_tcdm_data_out.add;
assign tcdm_req_o.q.write = fifo_hwpe_tcdm_data_out.wen;
assign strb = fifo_hwpe_tcdm_data_out.be;
assign unpack_data = fifo_hwpe_tcdm_data_out.data;
Expand Down Expand Up @@ -173,7 +177,7 @@ module snax_hwpe_to_reqrsp #(
//
//---------------------------------------------

assign hwpe_tcdm_slave.r_data = (fifo_addr_out[2]) ? tcdm_rsp_i.p.data[63:32] : tcdm_rsp_i.p.data[31:0];
assign hwpe_tcdm_slave.r_data = tcdm_rsp_i.p.data[31:0];
assign hwpe_tcdm_slave.r_valid = tcdm_rsp_i.p_valid;

//---------------------------------------------
Expand All @@ -183,7 +187,7 @@ module snax_hwpe_to_reqrsp #(
assign tcdm_req_o.q.addr = {{31{1'b0}},unpack_addr};
assign tcdm_req_o.q.data = {{31{1'b0}},unpack_data};
assign tcdm_req_o.q.amo = AMONone;
assign tcdm_req_o.q.strb = {StrbWidth{strb}};
assign tcdm_req_o.q.strb = '1; //Byte strobes are always valid
assign tcdm_req_o.q.user = '0;

// verilog_lint: waive-stop line-length
Expand Down
5 changes: 3 additions & 2 deletions hw/snax_hwpe_mac/src/snax_mac.sv
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

module snax_mac # (
parameter int unsigned DataWidth = 32,
parameter int unsigned SnaxTcdmPorts = 4,
parameter int unsigned SnaxTcdmPorts = 4,
parameter type acc_req_t = logic,
parameter type acc_rsp_t = logic,
parameter type tcdm_req_t = logic,
Expand Down Expand Up @@ -135,7 +135,8 @@ module snax_mac # (
snax_hwpe_to_reqrsp #(
.DataWidth ( DataWidth ), // Data width to use
.tcdm_req_t ( tcdm_req_t ), // TCDM request type
.tcdm_rsp_t ( tcdm_rsp_t ) // TCDM response type
.tcdm_rsp_t ( tcdm_rsp_t ), // TCDM response type
.AlignOutputDouble( 1'b1 ) // Align data to double word
) i_snax_hwpe_to_reqrsp (
.clk_i ( clk_i ), // Clock
.rst_ni ( rst_ni ), // Asynchronous reset, active low
Expand Down
8 changes: 4 additions & 4 deletions target/snitch_cluster/sw/apps/snax-mac/data/data.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
uint32_t VEC_LEN = 20;
uint32_t A[] = {99, 67, 39, 26, 62, 14, 17, 18, 54, 16,
uint64_t A[] = {99, 67, 39, 26, 62, 14, 17, 18, 54, 16,
44, 9, 26, 85, 72, 66, 95, 65, 43, 84};
uint32_t B[] = {86, 10, 14, 11, 38, 41, 94, 82, 97, 25,
uint64_t B[] = {86, 10, 14, 11, 38, 41, 94, 82, 97, 25,
96, 71, 44, 59, 93, 38, 57, 21, 84, 29};
uint32_t C = 51;
uint32_t OUT;
uint64_t C = 51;
uint64_t OUT;
29 changes: 10 additions & 19 deletions target/snitch_cluster/sw/apps/snax-mac/src/snax-mac.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ int main() {
// Set err value for checking
int err = 0;

uint32_t final_output;
uint64_t final_output;

uint32_t *local_a, *local_b, *local_c, *local_o;
uint64_t *local_a, *local_b, *local_c, *local_o;

// Allocate space in TCDM
local_a = (uint32_t *)snrt_l1_next();
local_a = (uint64_t *)snrt_l1_next();
local_b = local_a + VEC_LEN;
local_c = local_b + VEC_LEN;
local_o = local_c + 1;
Expand All @@ -24,8 +24,8 @@ int main() {

// Use data mover core to bring data from L3 to TCDM
if (snrt_is_dm_core()) {
size_t vector_size = VEC_LEN * sizeof(uint32_t);
size_t scale_size = 1 * sizeof(uint32_t);
size_t vector_size = VEC_LEN * sizeof(uint64_t);
size_t scale_size = 1 * sizeof(uint64_t);
snrt_dma_start_1d(local_a, A, vector_size);
snrt_dma_start_1d(local_b, B, vector_size);
snrt_dma_start_1d(local_c, &C, scale_size);
Expand All @@ -43,10 +43,10 @@ int main() {
uint32_t csr_set = snrt_mcycle();

// Set addresses
write_csr(0x3d0, (uint32_t)local_a);
write_csr(0x3d1, (uint32_t)local_b);
write_csr(0x3d2, (uint32_t)local_c);
write_csr(0x3d3, (uint32_t)local_o);
write_csr(0x3d0, (uint64_t)local_a);
write_csr(0x3d1, (uint64_t)local_b);
write_csr(0x3d2, (uint64_t)local_c);
write_csr(0x3d3, (uint64_t)local_o);

// Set configs
write_csr(0x3d4, 1); // Number of iterations
Expand All @@ -70,16 +70,7 @@ int main() {

uint32_t mac_end = snrt_mcycle();

// Data memory is 64 bits per access, hence it is double-word
// addressable, but the HWPE accelerator and Snitch cores are
// 32-bit (word) addressable. If the output address is
// divisible by 8, we read normally; otherwise, we read from
// the lower word address to get the lower 32 bits
if (((uint32_t)local_o) % 8) {
final_output = *(local_o - 1);
} else {
final_output = *local_o;
};
final_output = *local_o;

if (final_output != 54763) {
err = 1;
Expand Down

0 comments on commit 3163dcf

Please sign in to comment.