diff --git a/hw/snax_hwpe_mac/src/snax_hwpe_ctrl.sv b/hw/snax_hwpe_mac/src/snax_hwpe_ctrl.sv index e1769e8ae..6bc55782c 100644 --- a/hw/snax_hwpe_mac/src/snax_hwpe_ctrl.sv +++ b/hw/snax_hwpe_mac/src/snax_hwpe_ctrl.sv @@ -38,12 +38,6 @@ module snax_hwpe_ctrl #( // Number of bits to fill to extend to DataWidth localparam int unsigned FillBits = DataWidth - 32; - // CSR addresses for HWPE register mappings - localparam int unsigned HwpeStreamAddrA = 32'd64; - localparam int unsigned HwpeStreamAddrB = 32'd68; - localparam int unsigned HwpeStreamAddrC = 32'd72; - localparam int unsigned HwpeStreamAddrOut = 32'd76; - //--------------------------------------------- // Registers and wires //--------------------------------------------- @@ -121,14 +115,6 @@ module snax_hwpe_ctrl #( endcase end - // If the writes are towards address settings - // Make sure to trigger the flag so we can offset the addresses - logic address_register; - assign address_register = ( address_in == HwpeStreamAddrA - || address_in == HwpeStreamAddrB - || address_in == HwpeStreamAddrC - || address_in == HwpeStreamAddrOut ); - // Byte enable always only when we need to write assign be = (is_write) ? 4'hF : 4'h0; @@ -227,9 +213,7 @@ module snax_hwpe_ctrl #( periph.add <= address_in; periph.wen <= wen; periph.be <= be; - // If the CSR update is for a address setting, - // then we align to double (64 bits) - periph.data <= (address_register) ? {req_i.data_arga[31:3],3'b000} >> 1: req_i.data_arga[31:0]; + periph.data <= req_i.data_arga[31:0]; end end WRITE: begin diff --git a/hw/snax_hwpe_mac/src/snax_hwpe_to_reqrsp.sv b/hw/snax_hwpe_mac/src/snax_hwpe_to_reqrsp.sv index 286bcfe3b..e241b25fd 100644 --- a/hw/snax_hwpe_mac/src/snax_hwpe_to_reqrsp.sv +++ b/hw/snax_hwpe_mac/src/snax_hwpe_to_reqrsp.sv @@ -12,7 +12,6 @@ import reqrsp_pkg::*; module snax_hwpe_to_reqrsp #( parameter int unsigned AddrWidth = 48, parameter int unsigned DataWidth = 64, - parameter bit AlignOutputDouble = 0, // This triggers the x2 alignment for the addreses for doubles parameter type tcdm_req_t = logic, // Memory request payload type, usually write enable, write data, etc. parameter type tcdm_rsp_t = logic // Memory response payload type, usually read data )( @@ -23,12 +22,6 @@ module snax_hwpe_to_reqrsp #( hwpe_stream_intf_tcdm.slave hwpe_tcdm_slave // periph slave port ); - //--------------------------------------------- - // Some local parameters - //--------------------------------------------- - - localparam int unsigned StrbWidth = (DataWidth/8); - //--------------------------------------------- // Pack, unpack, and some logic //--------------------------------------------- @@ -39,7 +32,7 @@ module snax_hwpe_to_reqrsp #( logic fifo_hwpe_tcdm_empty; logic be; - logic strb; + logic [7:0] strb; logic [31:0] unpack_addr; logic [31:0] unpack_data; @@ -61,18 +54,19 @@ module snax_hwpe_to_reqrsp #( // Pack assign fifo_hwpe_tcdm_data_in.add = hwpe_tcdm_slave.add; - assign fifo_hwpe_tcdm_data_in.wen = !hwpe_tcdm_slave.wen; // Not wen, because HWPE uses wen=0 to write and wen = 1 to read but memory uses wen = 1 to write and wen = 0 to read + // Not wen, because HWPE uses wen=0 to write and wen = 1 to read but memory uses wen = 1 to write and wen = 0 to read + assign fifo_hwpe_tcdm_data_in.wen = !hwpe_tcdm_slave.wen; assign fifo_hwpe_tcdm_data_in.be = be; assign fifo_hwpe_tcdm_data_in.data = hwpe_tcdm_slave.data; assign fifo_hwpe_tcdm_data_in.valid = hwpe_tcdm_slave.gnt & hwpe_tcdm_slave.req; - // Unpack - // Align the address to double if the HWPE stream is an output - // The incoming HWPE addresses are in multiples of 4 - // Make them multiples of 8 by simply multiplying by 2 - assign unpack_addr = (AlignOutputDouble) ? fifo_hwpe_tcdm_data_out.add << 1 : fifo_hwpe_tcdm_data_out.add; + // Re-wiring + assign unpack_addr = fifo_hwpe_tcdm_data_out.add; assign tcdm_req_o.q.write = fifo_hwpe_tcdm_data_out.wen; - assign strb = fifo_hwpe_tcdm_data_out.be; + + // For the STRB, if the address is multiples of 8 ONLY, then we get lower 32 bits + // Otherwise we get upper 32 bits + assign strb = (unpack_addr[2]) ? 8'b1111_0000 : 8'b0000_1111; assign unpack_data = fifo_hwpe_tcdm_data_out.data; // This is necessary to include the empty. Since the FIFO does not clear its contents, @@ -137,6 +131,8 @@ module snax_hwpe_to_reqrsp #( typedef logic [31:0] fifo_addr_buffer_t; fifo_addr_buffer_t fifo_addr_out; + // This buffer is to hold the read address for + // word selection whether upper or lower 32-bits data fifo_v3 #( .dtype ( fifo_addr_buffer_t ), // Sum of address and .DEPTH ( 8 ) // Arbitrarily chosen @@ -177,17 +173,18 @@ module snax_hwpe_to_reqrsp #( // //--------------------------------------------- - assign hwpe_tcdm_slave.r_data = tcdm_rsp_i.p.data[31:0]; + // Select the appropriate word depending on address + assign hwpe_tcdm_slave.r_data = (fifo_addr_out[2]) ? tcdm_rsp_i.p.data[63:32] : tcdm_rsp_i.p.data[31:0]; assign hwpe_tcdm_slave.r_valid = tcdm_rsp_i.p_valid; //--------------------------------------------- // Some signals are unimportant so we tie them to 0 // Strb is just extended version of strb //--------------------------------------------- - assign tcdm_req_o.q.addr = {{31{1'b0}},unpack_addr}; - assign tcdm_req_o.q.data = {{31{1'b0}},unpack_data}; + assign tcdm_req_o.q.addr = {{32{1'b0}},unpack_addr}; + assign tcdm_req_o.q.data = (unpack_addr[2]) ? {unpack_data, {32{1'b0}}} : {{32{1'b0}},unpack_data}; assign tcdm_req_o.q.amo = AMONone; - assign tcdm_req_o.q.strb = '1; //Byte strobes are always valid + assign tcdm_req_o.q.strb = strb; assign tcdm_req_o.q.user = '0; // verilog_lint: waive-stop line-length diff --git a/hw/snax_hwpe_mac/src/snax_mac.sv b/hw/snax_hwpe_mac/src/snax_mac.sv index e2ccdaccb..4d3c63755 100644 --- a/hw/snax_hwpe_mac/src/snax_mac.sv +++ b/hw/snax_hwpe_mac/src/snax_mac.sv @@ -135,8 +135,7 @@ module snax_mac # ( snax_hwpe_to_reqrsp #( .DataWidth ( DataWidth ), // Data width to use .tcdm_req_t ( tcdm_req_t ), // TCDM request type - .tcdm_rsp_t ( tcdm_rsp_t ), // TCDM response type - .AlignOutputDouble( 1'b1 ) // Align data to double word + .tcdm_rsp_t ( tcdm_rsp_t ) // TCDM response type ) i_snax_hwpe_to_reqrsp ( .clk_i ( clk_i ), // Clock .rst_ni ( rst_ni ), // Asynchronous reset, active low diff --git a/target/snitch_cluster/sw/apps/snax-mac-simple/data/data.h b/target/snitch_cluster/sw/apps/snax-mac-simple/data/data.h index ffc352fa2..2e1bf6279 100644 --- a/target/snitch_cluster/sw/apps/snax-mac-simple/data/data.h +++ b/target/snitch_cluster/sw/apps/snax-mac-simple/data/data.h @@ -1,7 +1,7 @@ uint32_t VEC_LEN = 20; -uint64_t A[] = {99, 67, 39, 26, 62, 14, 17, 18, 54, 16, +uint32_t A[] = {99, 67, 39, 26, 62, 14, 17, 18, 54, 16, 44, 9, 26, 85, 72, 66, 95, 65, 43, 84}; -uint64_t B[] = {86, 10, 14, 11, 38, 41, 94, 82, 97, 25, +uint32_t B[] = {86, 10, 14, 11, 38, 41, 94, 82, 97, 25, 96, 71, 44, 59, 93, 38, 57, 21, 84, 29}; -uint64_t OUT[] = {8514, 670, 546, 286, 2356, 574, 1598, 1476, 5238, 400, +uint32_t OUT[] = {8514, 670, 546, 286, 2356, 574, 1598, 1476, 5238, 400, 4224, 639, 1144, 5015, 6696, 2508, 5415, 1365, 3612, 2436}; diff --git a/target/snitch_cluster/sw/apps/snax-mac-simple/src/snax-mac-simple.c b/target/snitch_cluster/sw/apps/snax-mac-simple/src/snax-mac-simple.c index 31133c5a1..59d7e2e72 100644 --- a/target/snitch_cluster/sw/apps/snax-mac-simple/src/snax-mac-simple.c +++ b/target/snitch_cluster/sw/apps/snax-mac-simple/src/snax-mac-simple.c @@ -10,11 +10,11 @@ int main() { // Set err value for checking int err = 0; - uint64_t *local_a, *local_b; - uint64_t *local_o; + uint32_t *local_a, *local_b; + uint32_t *local_o; // Allocate space in TCDM - local_a = (uint64_t *)snrt_l1_next(); + local_a = (uint32_t *)snrt_l1_next(); local_b = local_a + VEC_LEN; local_o = local_b + VEC_LEN; @@ -22,7 +22,7 @@ int main() { // Use data mover core to bring data from L3 to TCDM if (snrt_is_dm_core()) { - size_t vector_size = VEC_LEN * sizeof(uint64_t); + size_t vector_size = VEC_LEN * sizeof(uint32_t); snrt_dma_start_1d(local_a, A, vector_size); snrt_dma_start_1d(local_b, B, vector_size); } @@ -39,9 +39,9 @@ int main() { uint32_t csr_set = snrt_mcycle(); // Set addresses - write_csr(0x3d0, (uint64_t)local_a); - write_csr(0x3d1, (uint64_t)local_b); - write_csr(0x3d3, (uint64_t)local_o); + write_csr(0x3d0, (uint32_t)local_a); + write_csr(0x3d1, (uint32_t)local_b); + write_csr(0x3d3, (uint32_t)local_o); // Set configs write_csr(0x3d4, 1); // Number of iterations @@ -65,9 +65,9 @@ int main() { }; uint32_t mac_end = snrt_mcycle(); - uint64_t cpu_checker; + uint32_t cpu_checker; - for (uint64_t i = 0; i < (uint64_t)VEC_LEN; i++) { + for (uint32_t i = 0; i < (uint32_t)VEC_LEN; i++) { // Check if output is same as golden output if (*(local_o + i) != OUT[i]) { err++; diff --git a/target/snitch_cluster/sw/apps/snax-mac/data/data.h b/target/snitch_cluster/sw/apps/snax-mac/data/data.h index 6a3eee88c..276e4b6df 100644 --- a/target/snitch_cluster/sw/apps/snax-mac/data/data.h +++ b/target/snitch_cluster/sw/apps/snax-mac/data/data.h @@ -1,7 +1,7 @@ uint32_t VEC_LEN = 20; -uint64_t A[] = {99, 67, 39, 26, 62, 14, 17, 18, 54, 16, +uint32_t A[] = {99, 67, 39, 26, 62, 14, 17, 18, 54, 16, 44, 9, 26, 85, 72, 66, 95, 65, 43, 84}; -uint64_t B[] = {86, 10, 14, 11, 38, 41, 94, 82, 97, 25, +uint32_t B[] = {86, 10, 14, 11, 38, 41, 94, 82, 97, 25, 96, 71, 44, 59, 93, 38, 57, 21, 84, 29}; -uint64_t C = 51; -uint64_t OUT; \ No newline at end of file +uint32_t C = 51; +uint32_t OUT; diff --git a/target/snitch_cluster/sw/apps/snax-mac/src/snax-mac.c b/target/snitch_cluster/sw/apps/snax-mac/src/snax-mac.c index 8cd8c89c4..be48f71ef 100644 --- a/target/snitch_cluster/sw/apps/snax-mac/src/snax-mac.c +++ b/target/snitch_cluster/sw/apps/snax-mac/src/snax-mac.c @@ -10,12 +10,12 @@ int main() { // Set err value for checking int err = 0; - uint64_t final_output; + uint32_t final_output; - uint64_t *local_a, *local_b, *local_c, *local_o; + uint32_t *local_a, *local_b, *local_c, *local_o; // Allocate space in TCDM - local_a = (uint64_t *)snrt_l1_next(); + local_a = (uint32_t *)snrt_l1_next(); local_b = local_a + VEC_LEN; local_c = local_b + VEC_LEN; local_o = local_c + 1; @@ -24,8 +24,8 @@ int main() { // Use data mover core to bring data from L3 to TCDM if (snrt_is_dm_core()) { - size_t vector_size = VEC_LEN * sizeof(uint64_t); - size_t scale_size = 1 * sizeof(uint64_t); + size_t vector_size = VEC_LEN * sizeof(uint32_t); + size_t scale_size = 1 * sizeof(uint32_t); snrt_dma_start_1d(local_a, A, vector_size); snrt_dma_start_1d(local_b, B, vector_size); snrt_dma_start_1d(local_c, &C, scale_size); @@ -43,10 +43,10 @@ int main() { uint32_t csr_set = snrt_mcycle(); // Set addresses - write_csr(0x3d0, (uint64_t)local_a); - write_csr(0x3d1, (uint64_t)local_b); - write_csr(0x3d2, (uint64_t)local_c); - write_csr(0x3d3, (uint64_t)local_o); + write_csr(0x3d0, (uint32_t)local_a); + write_csr(0x3d1, (uint32_t)local_b); + write_csr(0x3d2, (uint32_t)local_c); + write_csr(0x3d3, (uint32_t)local_o); // Set configs write_csr(0x3d4, 1); // Number of iterations