diff --git a/hw/ip/snitch_cluster/src/snitch_cluster.sv b/hw/ip/snitch_cluster/src/snitch_cluster.sv index 679dc0f0..d3403a11 100644 --- a/hw/ip/snitch_cluster/src/snitch_cluster.sv +++ b/hw/ip/snitch_cluster/src/snitch_cluster.sv @@ -264,14 +264,14 @@ module snitch_cluster // Typedefs // -------- typedef logic [PhysicalAddrWidth-1:0] addr_t; - typedef logic [NarrowDataWidth-1:0] data_t; - typedef logic [NarrowDataWidth/8-1:0] strb_t; - typedef logic [WideDataWidth-1:0] data_dma_t; - typedef logic [WideDataWidth/8-1:0] strb_dma_t; - typedef logic [NarrowIdWidthIn-1:0] id_mst_t; - typedef logic [IdWidthOut-1:0] id_slv_t; - typedef logic [WideIdWidthIn-1:0] id_dma_mst_t; - typedef logic [IdWidthDMAOut-1:0] id_dma_slv_t; + typedef logic [NarrowDataWidth-1:0] data_t; + typedef logic [NarrowDataWidth/8-1:0] strb_t; + typedef logic [WideDataWidth-1:0] data_dma_t; + typedef logic [WideDataWidth/8-1:0] strb_dma_t; + typedef logic [NarrowIdWidthIn-1:0] id_mst_t; + typedef logic [IdWidthOut-1:0] id_slv_t; + typedef logic [WideIdWidthIn-1:0] id_dma_mst_t; + typedef logic [IdWidthDMAOut-1:0] id_dma_slv_t; typedef logic [UserWidth-1:0] user_t; typedef logic [TCDMAddrWidth-1:0] tcdm_addr_t; diff --git a/hw/ip/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/ip/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index aa2232c5..038ddce0 100644 --- a/hw/ip/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/ip/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -46,7 +46,7 @@ package ${cfg['pkg_name']}; localparam int unsigned NrMasters = 3 + ${cfg['nr_hives']}; localparam int unsigned NarrowIdWidthOut = $clog2(NrMasters) + NarrowIdWidthIn; - localparam int unsigned NrDmaMasters = 1; + localparam int unsigned NrDmaMasters = 2; localparam int unsigned WideIdWidthIn = ${cfg['dma_id_width_in']}; localparam int unsigned WideIdWidthOut = $clog2(NrDmaMasters) + WideIdWidthIn; diff --git a/hw/ip/snitch_dma/Bender.yml b/hw/ip/snitch_dma/Bender.yml index 62e93517..a9f76a2c 100644 
--- a/hw/ip/snitch_dma/Bender.yml +++ b/hw/ip/snitch_dma/Bender.yml @@ -18,6 +18,5 @@ sources: - src/axi_dma_error_handler.sv - src/axi_dma_perf_counters.sv - src/axi_dma_twod_ext.sv -- src/axi_dma_tc_snitch_fe_id_gen.sv # Level 2: - src/axi_dma_tc_snitch_fe.sv diff --git a/hw/ip/snitch_dma/src/axi_dma_error_handler.sv b/hw/ip/snitch_dma/src/axi_dma_error_handler.sv index 37a72803..5a9fab2e 100644 --- a/hw/ip/snitch_dma/src/axi_dma_error_handler.sv +++ b/hw/ip/snitch_dma/src/axi_dma_error_handler.sv @@ -5,7 +5,7 @@ // Thomas Benz // Sample implementation to report errors from the AXI bus. -// This module provides the adress of errors on a handshacked interface +// This module provides the address of errors on a handshaked interface module axi_dma_error_handler #( parameter int unsigned ADDR_WIDTH = -1, @@ -193,4 +193,4 @@ module axi_dma_error_handler #( end -endmodule : axi_dma_error_handler +endmodule diff --git a/hw/ip/snitch_dma/src/axi_dma_perf_counters.sv b/hw/ip/snitch_dma/src/axi_dma_perf_counters.sv index 8eed0cc0..dd5b32b1 100644 --- a/hw/ip/snitch_dma/src/axi_dma_perf_counters.sv +++ b/hw/ip/snitch_dma/src/axi_dma_perf_counters.sv @@ -5,7 +5,6 @@ // Thomas Benz // Sample implementation of performance counters. 
- module axi_dma_perf_counters #( parameter int unsigned TRANSFER_ID_WIDTH = -1, parameter int unsigned DATA_WIDTH = -1, @@ -148,14 +147,7 @@ module axi_dma_perf_counters #( if (dma_busy_i) dma_perf_d.dma_busy_cnt = dma_perf_q.dma_busy_cnt + 'h1; end - always_ff @(posedge clk_i or negedge rst_ni) begin : proc_counter - if(~rst_ni) begin - dma_perf_q <= '0; - end else begin - dma_perf_q <= dma_perf_d; - end - end - + `FF(dma_perf_q, dma_perf_d, 0); assign dma_perf_o = dma_perf_q; endmodule diff --git a/hw/ip/snitch_dma/src/axi_dma_tc_snitch_fe.sv b/hw/ip/snitch_dma/src/axi_dma_tc_snitch_fe.sv index 66968aa6..1ee35d20 100644 --- a/hw/ip/snitch_dma/src/axi_dma_tc_snitch_fe.sv +++ b/hw/ip/snitch_dma/src/axi_dma_tc_snitch_fe.sv @@ -7,6 +7,8 @@ // Implements the tightly-coupled frontend. This module can directly be connected // to an accelerator bus in the snitch system +`include "common_cells/registers.svh" + module axi_dma_tc_snitch_fe #( parameter int unsigned AddrWidth = 0, parameter int unsigned DataWidth = 0, @@ -76,12 +78,11 @@ module axi_dma_tc_snitch_fe #( //-------------------------------------- // Backend Instanciation //-------------------------------------- - logic backend_idle; - logic trans_complete; + logic backend_idle; burst_req_t burst_req; - logic burst_req_valid; - logic burst_req_ready; - logic oned_trans_complete; + logic burst_req_valid; + logic burst_req_ready; + logic oned_trans_complete; axi_dma_backend #( .DataWidth ( DMADataWidth ), @@ -162,16 +163,8 @@ module axi_dma_tc_snitch_fe #( logic [31:0] next_id; logic [31:0] completed_id; - axi_dma_tc_snitch_fe_id_gen #( - .ID_WIDTH ( 32 ) - ) i_axi_dma_tc_snitch_fe_id_gen ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .issue_i ( twod_req_valid && twod_req_ready ), - .retire_i ( oned_trans_complete && twod_req_last_realigned ), - .next_o ( next_id ), - .completed_o ( completed_id ) - ); + `FFL(next_id, next_id + 'h1, twod_req_valid & twod_req_ready, 0) + `FFL(completed_id, completed_id + 'h1, 
oned_trans_complete & twod_req_last_realigned, 0) // dma is busy when it is not idle assign dma_busy_o = next_id != completed_id; @@ -230,6 +223,8 @@ module axi_dma_tc_snitch_fe #( twod_req_d = twod_req_q; twod_req_d.burst_src = axi_pkg::BURST_INCR; twod_req_d.burst_dst = axi_pkg::BURST_INCR; + twod_req_d.cache_src = axi_pkg::CACHE_MODIFIABLE; + twod_req_d.cache_dst = axi_pkg::CACHE_MODIFIABLE; twod_req_valid = 1'b0; acc_qready_o = 1'b0; acc_pdata_spill = '0; @@ -241,125 +236,121 @@ module axi_dma_tc_snitch_fe #( dma_op_name = "Invalid"; // decode - if (acc_qvalid_i == 1'b1) unique casez (acc_qdata_op_i) - - // manipulate the source register - riscv_instr::DMSRC : begin - twod_req_d.src[31: 0] = acc_qdata_arga_i[31:0]; - twod_req_d.src[AddrWidth-1:32] = acc_qdata_argb_i[AddrWidth-1-32: 0]; - acc_qready_o = 1'b1; - is_dma_op = 1'b1; - dma_op_name = "DMSRC"; - end - - // manipulate the destination register - riscv_instr::DMDST : begin - twod_req_d.dst[31: 0] = acc_qdata_arga_i[31:0]; - twod_req_d.dst[AddrWidth-1:32] = acc_qdata_argb_i[AddrWidth-1-32: 0]; - acc_qready_o = 1'b1; - is_dma_op = 1'b1; - dma_op_name = "DMDST"; - end - - // start the DMA - riscv_instr::DMCPYI, - riscv_instr::DMCPY : begin - automatic logic [1:0] cfg; - - // Parse the transfer parameters from the register or immediate. - unique casez (acc_qdata_op_i) - riscv_instr::DMCPYI : cfg = acc_qdata_op_i[24:20]; - riscv_instr::DMCPY : cfg = acc_qdata_argb_i; - default:; - endcase - dma_op_name = "DMCPY"; - is_dma_op = 1'b1; - - twod_req_d.num_bytes = acc_qdata_arga_i; - twod_req_d.decouple_rw = cfg[0]; - twod_req_d.is_twod = cfg[1]; - - // Perform the following sequence: - // 1. wait for acc response channel to be ready (pready) - // 2. request twod transfer (valid) - // 3. wait for twod transfer to be accepted (ready) - // 4. send acc response (pvalid) - // 5. 
acknowledge acc request (qready) - if (acc_pready_spill) begin - twod_req_valid = 1'b1; - if (twod_req_ready) begin - acc_pdata_spill.id = acc_qid_i; - acc_pdata_spill.data = next_id; - acc_pdata_spill.error = 1'b0; - acc_pvalid_spill = 1'b1; - acc_qready_o = twod_req_ready; - end - end - end - - // status of the DMA - riscv_instr::DMSTATI, - riscv_instr::DMSTAT : begin - automatic logic [1:0] status; - - // Parse the status index from the register or immediate. - unique casez (acc_qdata_op_i) - riscv_instr::DMSTATI : status = acc_qdata_op_i[24:20]; - riscv_instr::DMSTAT : status = acc_qdata_argb_i; - default:; - endcase - dma_op_name = "DMSTAT"; - is_dma_op = 1'b1; - - // Compose the response. - acc_pdata_spill.id = acc_qid_i; - acc_pdata_spill.error = 1'b0; - case (status) - 2'b00 : acc_pdata_spill.data = completed_id; - 2'b01 : acc_pdata_spill.data = next_id; - 2'b10 : acc_pdata_spill.data = {{{8'd63}{1'b0}}, dma_busy_o}; - 2'b11 : acc_pdata_spill.data = {{{8'd63}{1'b0}}, !twod_req_ready}; - default:; - endcase - - // Wait for acc response channel to become ready, then ack the - // request. 
- if (acc_pready_spill) begin - acc_pvalid_spill = 1'b1; - acc_qready_o = 1'b1; - end - end - - // manipulate the strides - riscv_instr::DMSTR : begin - twod_req_d.stride_src = acc_qdata_arga_i; - twod_req_d.stride_dst = acc_qdata_argb_i; - acc_qready_o = 1'b1; - is_dma_op = 1'b1; - dma_op_name = "DMSTR"; - end - - // manipulate the strides - riscv_instr::DMREP : begin - twod_req_d.num_repetitions = acc_qdata_arga_i; - acc_qready_o = 1'b1; - is_dma_op = 1'b1; - dma_op_name = "DMREP"; - end - - default:; - endcase + if (acc_qvalid_i == 1'b1) begin + unique casez (acc_qdata_op_i) + + // manipulate the source register + riscv_instr::DMSRC : begin + twod_req_d.src[31: 0] = acc_qdata_arga_i[31:0]; + twod_req_d.src[AddrWidth-1:32] = acc_qdata_argb_i[AddrWidth-1-32: 0]; + acc_qready_o = 1'b1; + is_dma_op = 1'b1; + dma_op_name = "DMSRC"; + end + + // manipulate the destination register + riscv_instr::DMDST : begin + twod_req_d.dst[31: 0] = acc_qdata_arga_i[31:0]; + twod_req_d.dst[AddrWidth-1:32] = acc_qdata_argb_i[AddrWidth-1-32: 0]; + acc_qready_o = 1'b1; + is_dma_op = 1'b1; + dma_op_name = "DMDST"; + end + + // start the DMA + riscv_instr::DMCPYI, + riscv_instr::DMCPY : begin + automatic logic [1:0] cfg; + + // Parse the transfer parameters from the register or immediate. + unique casez (acc_qdata_op_i) + riscv_instr::DMCPYI : cfg = acc_qdata_op_i[24:20]; + riscv_instr::DMCPY : cfg = acc_qdata_argb_i; + default:; + endcase + dma_op_name = "DMCPY"; + is_dma_op = 1'b1; + + twod_req_d.num_bytes = acc_qdata_arga_i; + twod_req_d.decouple_rw = cfg[0]; + twod_req_d.is_twod = cfg[1]; + + // Perform the following sequence: + // 1. wait for acc response channel to be ready (pready) + // 2. request twod transfer (valid) + // 3. wait for twod transfer to be accepted (ready) + // 4. send acc response (pvalid) + // 5. 
acknowledge acc request (qready) + if (acc_pready_spill) begin + twod_req_valid = 1'b1; + if (twod_req_ready) begin + acc_pdata_spill.id = acc_qid_i; + acc_pdata_spill.data = next_id; + acc_pdata_spill.error = 1'b0; + acc_pvalid_spill = 1'b1; + acc_qready_o = twod_req_ready; + end + end + end + + // status of the DMA + riscv_instr::DMSTATI, + riscv_instr::DMSTAT: begin + automatic logic [1:0] status; + + // Parse the status index from the register or immediate. + unique casez (acc_qdata_op_i) + riscv_instr::DMSTATI: status = acc_qdata_op_i[24:20]; + riscv_instr::DMSTAT: status = acc_qdata_argb_i; + default:; + endcase + dma_op_name = "DMSTAT"; + is_dma_op = 1'b1; + + // Compose the response. + acc_pdata_spill.id = acc_qid_i; + acc_pdata_spill.error = 1'b0; + case (status) + 2'b00 : acc_pdata_spill.data = completed_id; + 2'b01 : acc_pdata_spill.data = next_id; + 2'b10 : acc_pdata_spill.data = {{{8'd63}{1'b0}}, dma_busy_o}; + 2'b11 : acc_pdata_spill.data = {{{8'd63}{1'b0}}, !twod_req_ready}; + default:; + endcase + + // Wait for acc response channel to become ready, then ack the + // request. 
+ if (acc_pready_spill) begin + acc_pvalid_spill = 1'b1; + acc_qready_o = 1'b1; + end + end + + // manipulate the strides + riscv_instr::DMSTR : begin + twod_req_d.stride_src = acc_qdata_arga_i; + twod_req_d.stride_dst = acc_qdata_argb_i; + acc_qready_o = 1'b1; + is_dma_op = 1'b1; + dma_op_name = "DMSTR"; + end + + // manipulate the strides + riscv_instr::DMREP : begin + twod_req_d.num_repetitions = acc_qdata_arga_i; + acc_qready_o = 1'b1; + is_dma_op = 1'b1; + dma_op_name = "DMREP"; + end + + default:; + endcase + end end //-------------------------------------- // State //-------------------------------------- - always_ff @(posedge clk_i or negedge rst_ni) begin : proc_modifiable_request - if(!rst_ni) begin - twod_req_q <= '0; - end else begin - twod_req_q <= twod_req_d; - end - end + `FF(twod_req_q, twod_req_d, '0) -endmodule : axi_dma_tc_snitch_fe +endmodule diff --git a/hw/ip/snitch_dma/src/axi_dma_tc_snitch_fe_id_gen.sv b/hw/ip/snitch_dma/src/axi_dma_tc_snitch_fe_id_gen.sv deleted file mode 100644 index 4c6ccd3e..00000000 --- a/hw/ip/snitch_dma/src/axi_dma_tc_snitch_fe_id_gen.sv +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -// Thomas Benz - -// simple example of a transaction id generator. 
just increase the transaction id on every request - -module axi_dma_tc_snitch_fe_id_gen #( - parameter int unsigned ID_WIDTH = -1 -) ( - input logic clk_i, - input logic rst_ni, - // new request is pushed - input logic issue_i, - // request is popped - input logic retire_i, - // next id is - output logic [ID_WIDTH-1:0] next_o, - // last id completed is - output logic [ID_WIDTH-1:0] completed_o -); - - //-------------------------------------- - // counters - //-------------------------------------- - logic [ID_WIDTH-1:0] next_d, next_q; - logic [ID_WIDTH-1:0] completed_d, completed_q; - - // count up on events - assign next_d = (issue_i == 1'b1) ? next_q + 'h1 : next_q; - assign completed_d = (retire_i == 1'b1) ? completed_q + 'h1 : completed_q; - - // assign outputs - assign next_o = next_q; - assign completed_o = completed_q; - - //-------------------------------------- - // state - //-------------------------------------- - always_ff @(posedge clk_i or negedge rst_ni) begin : proc_id_gen - if(~rst_ni) begin - next_q <= 1; - completed_q <= 0; - end else begin - next_q <= next_d; - completed_q <= completed_d; - end - end - -endmodule : axi_dma_tc_snitch_fe_id_gen - diff --git a/hw/ip/snitch_dma/src/axi_dma_twod_ext.sv b/hw/ip/snitch_dma/src/axi_dma_twod_ext.sv index 533f90cc..22e6d20f 100644 --- a/hw/ip/snitch_dma/src/axi_dma_twod_ext.sv +++ b/hw/ip/snitch_dma/src/axi_dma_twod_ext.sv @@ -4,8 +4,7 @@ // Thomas Benz -// Accept 2D requests and flatten them to 1D requests - +/// Accept 2D requests and flatten them to 1D requests. 
module axi_dma_twod_ext #( parameter int unsigned ADDR_WIDTH = -1, parameter int unsigned REQ_FIFO_DEPTH = -1, @@ -14,15 +13,15 @@ module axi_dma_twod_ext #( ) ( input logic clk_i, input logic rst_ni, - // arbitrary burst request + /// Arbitrary burst request output burst_req_t burst_req_o, output logic burst_req_valid_o, input logic burst_req_ready_i, - // 2D request request + /// 2D Request input twod_req_t twod_req_i, input logic twod_req_valid_i, output logic twod_req_ready_o, - // 2D request completed + /// 2D Request Completed output logic twod_req_last_o ); @@ -66,7 +65,7 @@ module axi_dma_twod_ext #( logic [ADDR_WIDTH-1:0] dst_address_d, dst_address_q; //-------------------------------------- - // 2D extension + // 2D Extension //-------------------------------------- always_comb begin : proc_twod_ext // defaults @@ -75,13 +74,13 @@ module axi_dma_twod_ext #( burst_req_valid_o = 1'b0; twod_req_last_o = 1'b0; - // conter keeps its value + // counter keeps its value num_bursts_d = num_bursts_q; src_address_d = src_address_q; dst_address_d = dst_address_q; //-------------------------------------- - // 1D case + // 1D Case //-------------------------------------- // in the case that we have a 1D transfer, hand the transfer out if (!twod_req_current.is_twod) begin @@ -103,7 +102,7 @@ module axi_dma_twod_ext #( twod_req_last_o = 1'b1; //-------------------------------------- - // 2D case - counter management + // 2D Case - Counter Management //-------------------------------------- // in the 2D case: we need to work with a counter end else begin @@ -157,18 +156,10 @@ module axi_dma_twod_ext #( end //-------------------------------------- - // Update Conters + // Update Counters //-------------------------------------- - always_ff @(posedge clk_i or negedge rst_ni) begin : proc_update_counters - if(!rst_ni) begin - num_bursts_q <= '0; - src_address_q <= '0; - dst_address_q <= '0; - end else begin - num_bursts_q <= num_bursts_d; - src_address_q <= src_address_d; - 
dst_address_q <= dst_address_d; - end - end + `FF(num_bursts_q, num_bursts_d, '0) + `FF(src_address_q, src_address_d, '0) + `FF(dst_address_q, dst_address_d, '0) -endmodule : axi_dma_twod_ext +endmodule diff --git a/hw/system/snitch_cluster/test/testharness.sv b/hw/system/snitch_cluster/test/testharness.sv index 481507fd..62e83433 100644 --- a/hw/system/snitch_cluster/test/testharness.sv +++ b/hw/system/snitch_cluster/test/testharness.sv @@ -39,12 +39,11 @@ module testharness import snitch_cluster_pkg::*; ( .wide_in_resp_o (wide_in_resp) ); - // Tie-off unused ports. + // Tie-off unused input ports. assign narrow_in_req = '0; - assign wide_out_resp = '0; assign wide_in_req = '0; - // Simulation memory. + // Narrow port into simulation memory. tb_memory #( .AxiAddrWidth (AddrWidth), .AxiDataWidth (NarrowDataWidth), @@ -59,4 +58,19 @@ module testharness import snitch_cluster_pkg::*; ( .rsp_o (narrow_out_resp) ); + // Wide port into simulation memory. + tb_memory #( + .AxiAddrWidth (AddrWidth), + .AxiDataWidth (WideDataWidth), + .AxiIdWidth (WideIdWidthOut), + .AxiUserWidth (UserWidth), + .req_t (wide_out_req_t), + .rsp_t (wide_out_resp_t) + ) i_dma ( + .clk_i, + .rst_ni, + .req_i (wide_out_req), + .rsp_o (wide_out_resp) + ); + endmodule diff --git a/sw/cmake/toolchain-gcc.cmake b/sw/cmake/toolchain-gcc.cmake index 3af75fe4..79235cf5 100644 --- a/sw/cmake/toolchain-gcc.cmake +++ b/sw/cmake/toolchain-gcc.cmake @@ -5,7 +5,7 @@ set(CMAKE_OBJDUMP riscv32-unknown-elf-objdump) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION true) add_compile_options(-march=rv32imafd -mabi=ilp32d -mcmodel=medany -mno-fdiv -ffast-math -fno-builtin-printf -fno-common) -add_link_options(-march=rv32imafd -mabi=ilp32d -nostdlib -Wl,-Ttext-segment=0x80000000) +add_link_options(-march=rv32imafd -mabi=ilp32d -nostartfiles -Wl,-Ttext-segment=0x80000000) # add_link_options(-Wl,--verbose) link_libraries(-lm -lgcc) diff --git a/sw/snRuntime/CMakeLists.txt b/sw/snRuntime/CMakeLists.txt index 6ef9dc27..75dbbf07 
100644 --- a/sw/snRuntime/CMakeLists.txt +++ b/sw/snRuntime/CMakeLists.txt @@ -24,9 +24,10 @@ include_directories(include) # Common sources set(sources src/barrier.c - src/team.c - src/printf.c + src/dma.c src/memcpy.c + src/printf.c + src/team.c ) # Generic Runtime (requiring bootloader/OS) @@ -49,8 +50,10 @@ add_snitch_test(varargs_1 tests/varargs_1.c) add_snitch_test(varargs_2 tests/varargs_2.c) add_snitch_test(barrier tests/barrier.c) +add_snitch_test_executable(dma_simple tests/dma_simple.c) add_snitch_test_executable(printf_simple tests/printf_simple.c) add_snitch_test_executable(printf_fmtint tests/printf_fmtint.c) +add_snitch_test_rtl(dma_simple) add_snitch_test_rtl(printf_simple) add_snitch_test_rtl(printf_fmtint) add_snitch_test_args(printf_simple printf_simple --no-opt-llvm --base-hartid=3) diff --git a/sw/snRuntime/include/snrt.h b/sw/snRuntime/include/snrt.h index ec75f5a7..9de1401b 100644 --- a/sw/snRuntime/include/snrt.h +++ b/sw/snRuntime/include/snrt.h @@ -47,3 +47,26 @@ extern void snrt_bcast_send(void *data, size_t len); extern void snrt_bcast_recv(void *data, size_t len); extern void *snrt_memcpy(void *dst, const void *src, size_t n); + +/// DMA runtime functions. +/// A DMA transfer identifier. +typedef uint32_t snrt_dma_txid_t; +/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers. +extern snrt_dma_txid_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, + size_t size); +/// Initiate an asynchronous 1D DMA transfer. +extern snrt_dma_txid_t snrt_dma_start_1d(void *dst, const void *src, + size_t size); +/// Initiate an asynchronous 2D DMA transfer with wide 64-bit pointers. +extern snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, + size_t size, size_t dst_stride, + size_t src_stride, + size_t repeat); +/// Initiate an asynchronous 2D DMA transfer. 
+extern snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, + size_t size, size_t src_stride, + size_t dst_stride, size_t repeat); +/// Block until a transfer finishes. +extern void snrt_dma_wait(snrt_dma_txid_t tid); +/// Block until all operation on the DMA ceases. +extern void snrt_dma_wait_all(); diff --git a/sw/snRuntime/src/dma.c b/sw/snRuntime/src/dma.c new file mode 100644 index 00000000..b0d9ad08 --- /dev/null +++ b/sw/snRuntime/src/dma.c @@ -0,0 +1,153 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +#include <snrt.h> + +/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers. +snrt_dma_txid_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, + size_t size) { + register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 + register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11 + register uint32_t reg_src_low asm("a2") = src >> 0; // 12 + register uint32_t reg_src_high asm("a3") = src >> 32; // 13 + register uint32_t reg_size asm("a4") = size; // 14 + + // dmsrc a2, a3 (rs1 = x12 = source low word, rs2 = x13 = source high word) + asm volatile( + ".word (0b0000000 << 25) | \ + ( (13) << 20) | \ + ( (12) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" ::"r"(reg_src_high), + "r"(reg_src_low)); + + // dmdst a0, a1 (rs1 = x10 = destination low word, rs2 = x11 = destination high word) + asm volatile( + ".word (0b0000001 << 25) | \ + ( (11) << 20) | \ + ( (10) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" ::"r"(reg_dst_high), + "r"(reg_dst_low)); + + // dmcpyi a0, a4, 0b00 (rd = x10 receives the transfer id, rs1 = x14 = size) + register uint32_t reg_txid asm("a0"); // 10 + asm volatile( + ".word (0b0000010 << 25) | \ + ( 0b00000 << 20) | \ + ( (14) << 15) | \ + ( 0b000 << 12) | \ + ( (10) << 7) | \ + (0b0101011 << 0) \n" + : "=r"(reg_txid) + : "r"(reg_size) : "memory"); // "memory": earlier stores to the source buffer must be emitted before the copy starts + + return reg_txid; +} + +/// Initiate an asynchronous 1D DMA transfer. 
+snrt_dma_txid_t snrt_dma_start_1d(void *dst, const void *src, size_t size) { + return snrt_dma_start_1d_wideptr((size_t)dst, (size_t)src, size); +} + +/// Initiate an asynchronous 2D DMA transfer with wide 64-bit pointers. +snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, + size_t size, size_t dst_stride, + size_t src_stride, size_t repeat) { + register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 + register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11 + register uint32_t reg_src_low asm("a2") = src >> 0; // 12 + register uint32_t reg_src_high asm("a3") = src >> 32; // 13 + register uint32_t reg_size asm("a4") = size; // 14 + register uint32_t reg_dst_stride asm("a5") = dst_stride; // 15 + register uint32_t reg_src_stride asm("a6") = src_stride; // 16 + register uint32_t reg_repeat asm("a7") = repeat; // 17 + + // dmsrc a2, a3 (rs1 = x12 = source low word, rs2 = x13 = source high word) + asm volatile( + ".word (0b0000000 << 25) | \ + ( (13) << 20) | \ + ( (12) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" ::"r"(reg_src_high), + "r"(reg_src_low)); + + // dmdst a0, a1 (rs1 = x10 = destination low word, rs2 = x11 = destination high word) + asm volatile( + ".word (0b0000001 << 25) | \ + ( (11) << 20) | \ + ( (10) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" ::"r"(reg_dst_high), + "r"(reg_dst_low)); + + // dmstr a6, a5 (rs1 = x16 = source stride, rs2 = x15 = destination stride) + asm volatile( + ".word (0b0000110 << 25) | \ + ( (15) << 20) | \ + ( (16) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" + : + : "r"(reg_dst_stride), "r"(reg_src_stride)); + + // dmrep a7 (rs1 = x17 = number of repetitions) + asm volatile( + ".word (0b0000111 << 25) | \ + ( (17) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" + : + : "r"(reg_repeat)); + + // dmcpyi a0, a4, 0b10 (immediate bit 1 selects a 2D transfer; rd = x10 receives the transfer id) + register uint32_t reg_txid asm("a0"); // 10 + asm volatile( + ".word (0b0000010 << 25) | \ + ( 0b00010 << 20) | \ + ( (14) << 15) | \ + ( 0b000 << 12) | \ + ( (10) << 7) | \ + (0b0101011 << 0) \n" + : "=r"(reg_txid) + : "r"(reg_size) : "memory"); // "memory": earlier stores to the source buffer must be emitted before the copy starts + + return reg_txid; +} + +/// Initiate an asynchronous 2D DMA transfer. 
+snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, size_t size, + size_t src_stride, size_t dst_stride, + size_t repeat) { + return snrt_dma_start_2d_wideptr((size_t)dst, (size_t)src, size, dst_stride, + src_stride, repeat); // the wideptr variant takes dst_stride before src_stride +} + +/// Block until a transfer finishes. +void snrt_dma_wait(snrt_dma_txid_t tid) { + // dmstati t0, 0 # 0=status.completed_id; loop while completed_id - tid <= 0 + asm volatile( + "1: \n" + ".word (0b0000100 << 25) | \ + ( 0b00000 << 20) | \ + ( 0b000 << 12) | \ + ( (5) << 7) | \ + (0b0101011 << 0) \n" + "sub t0, t0, %0 \n" + "blez t0, 1b \n" ::"r"(tid) + : "t0", "memory"); // "memory": loads of DMA-written buffers must not be moved before the wait +} + +/// Block until all operation on the DMA ceases. +void snrt_dma_wait_all() { + // dmstati t0, 2 # 2=status.busy + asm volatile( + "1: \n" + ".word (0b0000100 << 25) | \ + ( 0b00010 << 20) | \ + ( 0b000 << 12) | \ + ( (5) << 7) | \ + (0b0101011 << 0) \n" + "bne t0, zero, 1b \n" :: + : "t0", "memory"); // "memory": loads of DMA-written buffers must not be moved before the wait +} diff --git a/sw/snRuntime/src/memcpy.c b/sw/snRuntime/src/memcpy.c index 21f6bd4b..a00ebd4e 100644 --- a/sw/snRuntime/src/memcpy.c +++ b/sw/snRuntime/src/memcpy.c @@ -2,32 +2,9 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -// A simple implementation of memcpy. +#include <string.h> #include "snrt.h" -void *snrt_memcpy(void *dst, const void *src, size_t n) { - const void *end = dst + n; - - size_t dst_align = (size_t)dst % sizeof(size_t); - size_t src_align = (size_t)src % sizeof(size_t); - - if (dst_align == src_align) { - size_t a = dst_align; - while (a++ != 0 && dst != end) { - *(char *)dst++ = *(char *)src++; - } - while (dst + sizeof(size_t) <= end) { - *(size_t *)dst++ = *(size_t *)src++; - } - while (dst != end) { - *(char *)dst++ = *(char *)src++; - } - } else { - while (dst != end) { - *(char *)dst++ = *(char *)src++; - } - } - - return dst; -} +// TODO: Implement using cluster DMA for a faster `memcpy`. 
+void *snrt_memcpy(void *dst, const void *src, size_t n) { return memcpy(dst, src, n); } diff --git a/sw/snRuntime/tests/dma_simple.c b/sw/snRuntime/tests/dma_simple.c new file mode 100644 index 00000000..5260236e --- /dev/null +++ b/sw/snRuntime/tests/dma_simple.c @@ -0,0 +1,42 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include <snrt.h> + +// Allocate a buffer in the main memory which we will use to copy data around +// with the DMA. +uint32_t buffer[32]; + +int main() { + if (snrt_global_core_idx() != 8) return 0; // only DMA core + uint32_t errors = 0; + + // Populate buffers. + uint32_t buffer_src[32], buffer_dst[32]; + for (uint32_t i = 0; i < 32; i++) { + buffer[i] = 0xAAAAAAAA; + buffer_dst[i] = 0x55555555; + buffer_src[i] = i + 1; + } + + // Copy data to main memory. + snrt_dma_start_1d(buffer, buffer_src, sizeof(buffer)); + snrt_dma_wait_all(); + + // Check that the main memory buffer contains the correct data. + for (uint32_t i = 0; i < 32; i++) { + errors += (buffer[i] != buffer_src[i]); + } + + // Copy data to L1. + snrt_dma_start_1d(buffer_dst, buffer, sizeof(buffer)); + snrt_dma_wait_all(); + + // Check that the L1 buffer contains the correct data. + for (uint32_t i = 0; i < 32; i++) { + errors += (buffer_dst[i] != buffer_src[i]); + } + + return errors; +}