diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 721828ba7..a8e3612e2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -318,31 +318,32 @@ jobs: sw/runtime.yaml \ sw/snax-streamer-gemm-conv-simd-run.yaml -j - snax-xdma-unittest: - name: Run several unit tests for xdma + snax-xdma-vlt-generic: + name: Simulate SW on xdma w/ Verilator (Generic LLVM) runs-on: ubuntu-22.04 container: image: ghcr.io/kuleuven-micas/snax:main steps: - uses: actions/checkout@v2 with: - submodules: 'recursive' - - name: Test xdma streamer - working-directory: hw/chisel - run: |- - sbt "testOnly snax.xdma.xdmaStreamer.*" - - name: Test xdma DataPath and Controller - working-directory: hw/chisel - run: |- - sbt "testOnly snax.xdma.xdmaFrontend.*" - - name: Test xdma Extension Framework + MaxPool - working-directory: hw/chisel - run: |- - sbt "testOnly snax.xdma.xdmaExtension.*" - - name: Test xdma Top Module - working-directory: hw/chisel + submodules: "recursive" + - name: Build Hardware + run: | + make CFG_OVERRIDE=cfg/snax-streamer-gemmX-xdma.hjson \ + -C target/snitch_cluster bin/snitch_cluster.vlt -j$(nproc) + - name: Build Software + run: | + make -C target/snitch_cluster sw \ + CFG_OVERRIDE=cfg/snax-streamer-gemmX-xdma.hjson \ + SELECT_RUNTIME=rtl-generic \ + SELECT_TOOLCHAIN=llvm-generic + - name: Run Tests + working-directory: target/snitch_cluster run: |- - sbt "testOnly snax.xdma.xdmaTop.*" + ./run.py --simulator verilator \ + sw/runtime.yaml \ + sw/snax-streamer-gemm-conv-simd-run.yaml \ + sw/snax-xdma-run.yaml -j ############################################ # Build SW on Snitch Cluster w/ Banshee # diff --git a/.github/workflows/gen.yml b/.github/workflows/gen.yml deleted file mode 100644 index 3b1125c40..000000000 --- a/.github/workflows/gen.yml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2024 KU Leuven. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -# Run functional regression checks -name: Template and Chisel Generation -on: - push: - branches: ["main"] - pull_request: -jobs: - - ##################### - # Generation Checks # - ##################### - gen_checks: - name: Generation checks - runs-on: ubuntu-latest - container: - image: ghcr.io/kuleuven-micas/snax:main - steps: - - uses: actions/checkout@v2 - - name: Check wrapper, parameter, and chisel generation - run: | - ./util/wrappergen/wrappergen.py \ - --cfg_path="./target/snitch_cluster/cfg/snax-alu.hjson" \ - --tpl_path="./hw/templates/" \ - --chisel_path="./hw/chisel" \ - --gen_path="./target/snitch_cluster/generated/" diff --git a/.github/workflows/scala-unit-test.yml b/.github/workflows/scala-unit-test.yml index ee83bac9a..f7033a982 100644 --- a/.github/workflows/scala-unit-test.yml +++ b/.github/workflows/scala-unit-test.yml @@ -23,3 +23,19 @@ jobs: working-directory: hw/chisel run: | mill Snax.test + - name: Test xdma streamer + working-directory: hw/chisel + run: |- + sbt "testOnly snax.xdma.xdmaStreamer.*" + - name: Test xdma DataPath and Controller + working-directory: hw/chisel + run: |- + sbt "testOnly snax.xdma.xdmaFrontend.*" + - name: Test xdma Extension Framework + MaxPool + working-directory: hw/chisel + run: |- + sbt "testOnly snax.xdma.xdmaExtension.*" + - name: Test xdma Top Module + working-directory: hw/chisel + run: |- + sbt "testOnly snax.xdma.xdmaTop.*" diff --git a/Bender.yml b/Bender.yml index 5fd072744..09eeb9172 100644 --- a/Bender.yml +++ b/Bender.yml @@ -371,6 +371,26 @@ sources: # Level 2 - target/snitch_cluster/generated/snax_streamer_gemmX/snax_streamer_gemmX_wrapper.sv + - target: snax_streamer_gemmX_xdma + files: + # Level 0 + - hw/chisel_acc/generated/gemmx/BlockGemmRescaleSIMD.sv + - hw/chisel_acc/src/snax_streamer_gemmX_shell_wrapper.sv + - target/snitch_cluster/generated/snax_streamer_gemmX/snax_streamer_gemmX_csrman_CsrManager.sv + - target/snitch_cluster/generated/snax_streamer_gemmX/snax_streamer_gemmX_streamer_StreamerTop.sv + # Level 1 + - target/snitch_cluster/generated/snax_streamer_gemmX/snax_streamer_gemmX_csrman_wrapper.sv + - target/snitch_cluster/generated/snax_streamer_gemmX/snax_streamer_gemmX_streamer_wrapper.sv + # Level 2 + - target/snitch_cluster/generated/snax_streamer_gemmX/snax_streamer_gemmX_wrapper.sv + + # xdma source + # Level 0 + - target/snitch_cluster/generated/snax_streamer_gemmX_xdma_cluster_xdma/snax_streamer_gemmX_xdma_cluster_xdma.sv + # Level 1 + - target/snitch_cluster/generated/snax_streamer_gemmX_xdma_cluster_xdma/snax_streamer_gemmX_xdma_cluster_xdma_wrapper.sv + + - target: test files: - hw/snitch_cluster/test/snitch_tcdm_interconnect_tb.sv @@ -416,6 +436,9 @@ sources: - target: snax_streamer_gemmX files: - target/snitch_cluster/generated/snax_streamer_gemmX_cluster_wrapper.sv + - target: snax_streamer_gemmX_xdma + files: + - target/snitch_cluster/generated/snax_streamer_gemmX_xdma_cluster_wrapper.sv - target: snax_streamer_gemm_add_c files: - target/snitch_cluster/generated/snax_streamer_gemm_add_c_cluster_wrapper.sv diff --git a/hw/chisel/src/main/scala/snax/utils/ArgParser.scala b/hw/chisel/src/main/scala/snax/utils/ArgParser.scala new file mode 100644 index 000000000..7aef89e59 --- /dev/null +++ b/hw/chisel/src/main/scala/snax/utils/ArgParser.scala @@ -0,0 +1,32 @@ +package snax.utils + +object ArgParser { + + /* + * Function to parse the arguments provided to the program + * Arguments are expected to be in the form of --arg_name arg_value or --arg_name + 
* Returns a map of argument names to their values + */ + def parse(args: Array[String]): collection.mutable.Map[String, String] = { + val parsed_args = collection.mutable.Map[String, String]() + var i = 0 + while (i < args.length) { + if (args(i)(0) == '-' && args(i)(1) == '-') { + if ( + i == args.length - 1 || (args(i + 1)(0) == '-' && args(i + 1)( + 1 + ) == '-') + ) { + // Last argument or next argument is also a flag + parsed_args(args(i).substring(2)) = "NoArg" + } else parsed_args(args(i).substring(2)) = args(i + 1) + } + i += 1 + } + if (parsed_args.size == 0) { + println("No arguments provided. Please provide arguments") + sys.exit(1) + } + parsed_args + } +} diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/DMAExtension.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/DMAExtension.scala index 903f2662b..2c54d736d 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/DMAExtension.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/DMAExtension.scala @@ -6,6 +6,7 @@ import chisel3.util._ import snax.utils._ import snax.xdma.CommonCells._ import snax.xdma.DesignParams._ +import os.copy.over /** The parent (abstract) Class for the DMA Extension Generation Params This * class template is used to isolate the definition of class (when user provide @@ -24,7 +25,8 @@ abstract class HasDMAExtension { implicit val extensionParam: DMAExtensionParam def totalCsrNum = extensionParam.userCsrNum + 1 - def instantiate: DMAExtension + def namePostfix = "_xdma_extension_" + extensionParam.moduleName + def instantiate(clusterName: String): DMAExtension } /** The parent (abstract) Class for the DMA Extension Implementation (Circuit) @@ -45,8 +47,6 @@ abstract class DMAExtension(implicit extensionParam: DMAExtensionParam) extends Module with RequireAsyncReset { - override def desiredName: String = extensionParam.moduleName - val io = IO(new Bundle { val csr_i = Input( Vec(extensionParam.userCsrNum + 1, UInt(32.W)) @@ -74,7 +74,9 @@ abstract class DMAExtension(implicit extensionParam: DMAExtensionParam) // Structure to bypass extension: Demux private[this] val inputDemux = Module( - new DemuxDecoupled(UInt(extensionParam.dataWidth.W), numOutput = 2) + new DemuxDecoupled(UInt(extensionParam.dataWidth.W), numOutput = 2) { + override def desiredName = s"xdma_extension_inputDemux" + } ) inputDemux.io.sel := bypass inputDemux.io.in <> io.data_i @@ -85,7 +87,9 @@ abstract class DMAExtension(implicit extensionParam: DMAExtensionParam) // Structure to bypass extension: Mux private[this] val outputMux = Module( - new MuxDecoupled(UInt(extensionParam.dataWidth.W), numInput = 2) + new MuxDecoupled(UInt(extensionParam.dataWidth.W), numInput = 2) { + override def desiredName = s"xdma_extension_outputMux" + } ) outputMux.io.sel := bypass outputMux.io.out <> io.data_o diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/MaxPool.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/MaxPool.scala index 1b7e2a63e..dcbd8c527 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/MaxPool.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/MaxPool.scala @@ -32,8 +32,10 @@ object HasMaxPool extends HasDMAExtension { userCsrNum = 1, dataWidth = 512 ) - def instantiate: MaxPool = Module( - new MaxPool(elementWidth = 8) + def instantiate(clusterName: String): MaxPool = Module( + new MaxPool(elementWidth = 8) { + override def desiredName = clusterName + namePostfix + } ) } @@ -43,11 +45,13 @@ class MaxPool(elementWidth: Int)(implicit extensionParam: 
DMAExtensionParam) // Counter to record the steps // 256-element MaxPool maximum - val counters = Module(new snax.xdma.xdmaStreamer.BasicCounter(8)) - counters.io.ceil := ext_csr_i(0) - counters.io.reset := ext_start_i - counters.io.tick := ext_data_i.fire - ext_busy_o := counters.io.value =/= 0.U + val counter = Module(new snax.xdma.xdmaStreamer.BasicCounter(8) { + override val desiredName = "xdma_extension_MaxPoolCounter" + }) + counter.io.ceil := ext_csr_i(0) + counter.io.reset := ext_start_i + counter.io.tick := ext_data_i.fire + ext_busy_o := counter.io.value =/= 0.U // The wire to connect the output result val ext_data_o_bits = Wire( @@ -57,7 +61,7 @@ class MaxPool(elementWidth: Int)(implicit extensionParam: DMAExtensionParam) val PEs = for (i <- 0 until extensionParam.dataWidth / elementWidth) yield { val PE = Module(new MAXPoolPE(dataWidth = elementWidth)) - PE.io.init_i := counters.io.value === 0.U + PE.io.init_i := counter.io.value === 0.U PE.io.data_i.valid := ext_data_i.fire PE.io.data_i.bits := ext_data_i .bits((i + 1) * elementWidth - 1, i * elementWidth) @@ -79,7 +83,7 @@ class MaxPool(elementWidth: Int)(implicit extensionParam: DMAExtensionParam) // Under this condition, the system does not need to send the sum to the next stage ext_data_i.ready := true.B ext_data_o.valid := false.B - when(ext_data_i.fire && counters.io.lastVal) { + when(ext_data_i.fire && counter.io.lastVal) { // The result is about to be ready, switching state to output current_state := s_output } diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/Memset.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/Memset.scala index 227b3f2e8..3a3c228dd 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/Memset.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/Memset.scala @@ -10,7 +10,9 @@ object HasMemset extends HasDMAExtension { userCsrNum = 1, dataWidth = 512 ) - def instantiate: Memset = Module(new Memset) + def instantiate(clusterName: String): Memset = Module(new Memset { + override def desiredName = clusterName + namePostfix + }) } class Memset()(implicit extensionParam: DMAExtensionParam) diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/Transposer.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/Transposer.scala index 8175182cd..7f0b9a29c 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/Transposer.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaExtension/Transposer.scala @@ -11,7 +11,9 @@ object HasTransposer extends HasDMAExtension { dataWidth = 512 ) - def instantiate: Transposer = Module(new Transposer) + def instantiate(clusterName: String): Transposer = Module(new Transposer { + override def desiredName = clusterName + namePostfix + }) } class Transposer()(implicit extensionParam: DMAExtensionParam) diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaFrontend/DMACtrl.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaFrontend/DMACtrl.scala index 0284e4508..5a994c105 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaFrontend/DMACtrl.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaFrontend/DMACtrl.scala @@ -46,9 +46,15 @@ class DMACtrlIO( val csrIO = new SnaxCsrIO(csrAddrWidth = 32) } -class SrcConfigRouter(dataType: DMADataPathCfgInternalIO, tcdmSize: Int) - extends Module +class SrcConfigRouter( + dataType: DMADataPathCfgInternalIO, + tcdmSize: Int, + clusterName: String = "unnamed_cluster" +) extends Module with RequireAsyncReset { + + override val desiredName = s"${clusterName}_xdma_ctrl_srcConfigRouter" + val io = IO(new 
Bundle { val clusterBaseAddress = Input(dataType.agu_cfg.Ptr) val from = Flipped(new Bundle { @@ -61,12 +67,18 @@ class SrcConfigRouter(dataType: DMADataPathCfgInternalIO, tcdmSize: Int) } }) - val i_from_arbiter = Module(new Arbiter(dataType, 2)) + val i_from_arbiter = Module(new Arbiter(dataType, 2) { + override val desiredName = + s"${clusterName}_xdma_ctrl_SrcConfigRouter_Arbiter" + }) i_from_arbiter.io.in(0) <> io.from.local i_from_arbiter.io.in(1) <> io.from.remote val i_to_demux = Module( - new DemuxDecoupled(dataType = dataType, numOutput = 3) + new DemuxDecoupled(dataType = dataType, numOutput = 3) { + override val desiredName = + s"${clusterName}_xdma_ctrl_SrcConfigRouter_Demux" + } ) i_from_arbiter.io.out -|> i_to_demux.io.in @@ -101,9 +113,15 @@ class SrcConfigRouter(dataType: DMADataPathCfgInternalIO, tcdmSize: Int) i_to_demux.io.out(cType_discard.litValue.toInt).ready := true.B } -class DstConfigRouter(dataType: DMADataPathCfgInternalIO, tcdmSize: Int) - extends Module +class DstConfigRouter( + dataType: DMADataPathCfgInternalIO, + tcdmSize: Int, + clusterName: String = "unnamed_cluster" +) extends Module with RequireAsyncReset { + + override val desiredName = s"${clusterName}_xdma_ctrl_dstConfigRouter" + val io = IO(new Bundle { val clusterBaseAddress = Input(dataType.agu_cfg.Ptr) val from = Flipped(new Bundle { @@ -114,7 +132,10 @@ class DstConfigRouter(dataType: DMADataPathCfgInternalIO, tcdmSize: Int) } }) val i_to_demux = Module( - new DemuxDecoupled(dataType = dataType, numOutput = 2) + new DemuxDecoupled(dataType = dataType, numOutput = 2) { + override val desiredName = + s"${clusterName}_xdma_ctrl_dstConfigRouter_Demux" + } ) io.from.local <> i_to_demux.io.in @@ -149,7 +170,8 @@ class DMACtrl( readerparam: DMADataPathParam, writerparam: DMADataPathParam, axiWidth: Int = 512, - csrAddrWidth: Int = 32 + csrAddrWidth: Int = 32, + clusterName: String = "unnamed_cluster" ) extends Module with RequireAsyncReset { val io = IO( @@ -160,6 +182,8 @@ class DMACtrl( ) ) + override val desiredName = s"${clusterName}_xdma_ctrl" + val i_csrmanager = Module( new CsrManager( csrNumReadWrite = 2 + // Reader Pointer needs two CSRs @@ -179,7 +203,7 @@ class DMACtrl( // Set to two at current, 1) The number of submitted request; 2) The number of finished request. 
Since the reader path may be forward to remote, here I only count the writer branch csrAddrWidth = csrAddrWidth, // Set a name for the module class so that it will not overlapped with other csrManagers in user-defined accelerators - csrModuleTagName = "xDMA" + csrModuleTagName = s"${clusterName}_xdma_" ) ) @@ -281,7 +305,8 @@ class DMACtrl( val i_srcCfgRouter = Module( new SrcConfigRouter( dataType = chiselTypeOf(preRoute_src_local.bits), - tcdmSize = readerparam.rwParam.tcdm_param.tcdmSize + tcdmSize = readerparam.rwParam.tcdm_param.tcdmSize, + clusterName = clusterName ) ) i_srcCfgRouter.io.clusterBaseAddress := io.clusterBaseAddress @@ -308,7 +333,8 @@ class DMACtrl( val i_dstCfgRouter = Module( new DstConfigRouter( dataType = chiselTypeOf(preRoute_dst_local.bits), - tcdmSize = writerparam.rwParam.tcdm_param.tcdmSize + tcdmSize = writerparam.rwParam.tcdm_param.tcdmSize, + clusterName = clusterName ) ) i_dstCfgRouter.io.clusterBaseAddress := io.clusterBaseAddress @@ -322,10 +348,14 @@ class DMACtrl( // Loopback / Non-loopback seperation for pseudo-OoO commit val i_src_LoopbackDemux = Module( - new DemuxDecoupled(chiselTypeOf(postRoute_src_local.bits), numOutput = 2) + new DemuxDecoupled(chiselTypeOf(postRoute_src_local.bits), numOutput = 2) { + override val desiredName = s"${clusterName}_xdma_ctrl_src_LoopbackDemux" + } ) val i_dst_LoopbackDemux = Module( - new DemuxDecoupled(chiselTypeOf(postRoute_dst_local.bits), numOutput = 2) + new DemuxDecoupled(chiselTypeOf(postRoute_dst_local.bits), numOutput = 2) { + override val desiredName = s"${clusterName}_xdma_ctrl_dst_LoopbackDemux" + } ) // (1) is loopback; (0) is non-loopback @@ -335,7 +365,9 @@ class DMACtrl( i_dst_LoopbackDemux.io.in <> postRoute_dst_local val i_srcCfgArbiter = Module( - new Arbiter(chiselTypeOf(postRoute_src_local.bits), 2) + new Arbiter(chiselTypeOf(postRoute_src_local.bits), 2) { + override val desiredName = s"${clusterName}_xdma_ctrl_srcCfgArbiter" + } ) // Non-loopback has lower priority, so that it is connect to 1st port of arbiter // Optional FIFO for non-loopback cfg is added (depth = 2) @@ -345,7 +377,9 @@ class DMACtrl( i_src_LoopbackDemux.io.out(1) <> i_srcCfgArbiter.io.in(0) val i_dstCfgArbiter = Module( - new Arbiter(chiselTypeOf(postRoute_dst_local.bits), 2) + new Arbiter(chiselTypeOf(postRoute_dst_local.bits), 2) { + override val desiredName = s"${clusterName}_xdma_ctrl_dstCfgArbiter" + } ) // Non-loopback has lower priority, so that it is connect to 1st port of arbiter // Optional FIFO for non-loopback cfg is added (depth = 2) @@ -425,13 +459,17 @@ class DMACtrl( io.localDMADataPath.writer_cfg_o := current_cfg_dst.bits // Counter for submitted cfg and finished cfg (With these two values, the control core knows which task is finished) - val i_submittedTaskCounter = Module(new BasicCounter(32, hasCeil = false)) + val i_submittedTaskCounter = Module(new BasicCounter(32, hasCeil = false) { + override val desiredName = s"${clusterName}_xdma_ctrl_submittedTaskCounter" + }) i_submittedTaskCounter.io.ceil := DontCare i_submittedTaskCounter.io.reset := false.B i_submittedTaskCounter.io.tick := i_csrmanager.io.csr_config_out.fire i_csrmanager.io.read_only_csr(0) := i_submittedTaskCounter.io.value - val i_finishedTaskCounter = Module(new BasicCounter(32, hasCeil = false)) + val i_finishedTaskCounter = Module(new BasicCounter(32, hasCeil = false) { + override val desiredName = s"${clusterName}_xdma_ctrl_finishedTaskCounter" + }) i_finishedTaskCounter.io.ceil := DontCare i_finishedTaskCounter.io.reset := false.B 
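// A minimal, self-contained sketch of the module-naming idiom used throughout
// this patch: sub-modules are instantiated as anonymous subclasses that
// override desiredName with a cluster-prefixed string, so that differently
// parameterized clusters elaborate to distinct Verilog module names instead of
// clashing definitions. Module and instance names below are hypothetical, not
// taken from the repository.
import chisel3._

class ByteRegister extends Module {
  val io = IO(new Bundle {
    val d = Input(UInt(8.W))
    val q = Output(UInt(8.W))
  })
  io.q := RegNext(io.d)
}

class NamingDemo(clusterName: String) extends Module {
  val io = IO(new Bundle {
    val d = Input(UInt(8.W))
    val q = Output(UInt(8.W))
  })
  // Emitted as e.g. "my_cluster_xdma_ctrl_byteRegister" rather than "ByteRegister"
  val i_reg = Module(new ByteRegister {
    override val desiredName = s"${clusterName}_xdma_ctrl_byteRegister"
  })
  i_reg.io.d := io.d
  io.q := i_reg.io.q
}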
i_finishedTaskCounter.io.tick := (RegNext( diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaFrontend/DMADataPath.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaFrontend/DMADataPath.scala index 4a4544b96..c557fb1be 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaFrontend/DMADataPath.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaFrontend/DMADataPath.scala @@ -92,9 +92,15 @@ class DMADataPathCfgInternalIO(param: DMADataPathParam) } -class DMADataPath(readerparam: DMADataPathParam, writerparam: DMADataPathParam) - extends Module +class DMADataPath( + readerparam: DMADataPathParam, + writerparam: DMADataPathParam, + clusterName: String = "unnamed_cluster" +) extends Module with RequireAsyncReset { + + override val desiredName = s"${clusterName}_xdma_datapath" + val io = IO(new Bundle { // All config signal for reader and writer val reader_cfg_i = Input(new DMADataPathCfgInternalIO(readerparam)) @@ -163,8 +169,12 @@ class DMADataPath(readerparam: DMADataPathParam, writerparam: DMADataPathParam) } }) - val i_reader = Module(new Reader(readerparam.rwParam)) - val i_writer = Module(new Writer(writerparam.rwParam)) + val i_reader = Module( + new Reader(readerparam.rwParam, clusterName = clusterName) + ) + val i_writer = Module( + new Writer(writerparam.rwParam, clusterName = clusterName) + ) // Connect TCDM memory to reader and writer i_reader.io.tcdm_req <> io.tcdm_reader.req @@ -194,7 +204,7 @@ class DMADataPath(readerparam: DMADataPathParam, writerparam: DMADataPathParam) var remainingCSR = io.reader_cfg_i.ext_cfg.toIndexedSeq // Give an alias to all extension's csr for a easier manipulation val i_reader_extentionList = for (i <- readerparam.extParam) yield { - val extension = i.instantiate + val extension = i.instantiate(clusterName = clusterName) extension.io.csr_i := remainingCSR.take(extension.io.csr_i.length) remainingCSR = remainingCSR.drop(extension.io.csr_i.length) extension @@ -248,7 +258,7 @@ class DMADataPath(readerparam: DMADataPathParam, writerparam: DMADataPathParam) var remainingCSR = io.writer_cfg_i.ext_cfg.toIndexedSeq // Give an alias to all extension's csr for a easier manipulation val i_writer_extentionList = for (i <- writerparam.extParam) yield { - val extension = i.instantiate + val extension = i.instantiate(clusterName = clusterName) extension.io.csr_i := remainingCSR.take(extension.io.csr_i.length) remainingCSR = remainingCSR.drop(extension.io.csr_i.length) extension diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/AddressGenUnit.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/AddressGenUnit.scala index b3b569f83..9bbc8c2b4 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/AddressGenUnit.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/AddressGenUnit.scala @@ -36,7 +36,6 @@ class BasicCounter(width: Int, hasCeil: Boolean = true) io.lastVal := { if (hasCeil) (value === io.ceil - 1.U) else (value.andR) } - // io.overFlow := (value === 0.U) && (valuePrevious === io.ceil - 1.U) } /** AGU is the module to automatically generate the address for all ports. 
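// The DMADataPath changes above distribute one flat ext_cfg CSR vector over
// the chain of extensions by repeatedly taking the first csr_i.length entries
// and dropping them from the remainder. A plain-Scala sketch of that slicing
// idea, with Chisel wires replaced by Ints for brevity; the per-extension
// counts are only an example (userCsrNum plus one bypass/enable CSR each):
object CsrSliceSketch extends App {
  val csrPerExtension = Seq(2, 2, 1)                 // e.g. Memset, MaxPool, Transposer
  val flatCsr = (0 until csrPerExtension.sum).toSeq  // stand-in for ext_cfg

  var remaining = flatCsr
  val slices = for (n <- csrPerExtension) yield {
    val slice = remaining.take(n)  // CSRs wired to this extension's csr_i
    remaining = remaining.drop(n)  // hand the rest to the next extension
    slice
  }
  slices.zipWithIndex.foreach { case (s, i) =>
    println(s"extension $i gets CSR indices ${s.mkString(", ")}")
  }
}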
@@ -61,8 +60,10 @@ class AddressGenUnitCfgIO(param: AddressGenUnitParam) extends Bundle { val Bounds = Vec(param.dimension, UInt(16.W)) } -class AddressGenUnit(param: AddressGenUnitParam) - extends Module +class AddressGenUnit( + param: AddressGenUnitParam, + module_name_prefix: String = "unnamed_cluster" +) extends Module with RequireAsyncReset { val io = IO(new Bundle { val cfg = Input(new AddressGenUnitCfgIO(param)) @@ -77,8 +78,12 @@ class AddressGenUnit(param: AddressGenUnitParam) Vec(param.spatialUnrollingFactor, Decoupled(UInt(param.addressWidth.W))) }) + override val desiredName = s"${module_name_prefix}_AddressGenUnit" + // Create a counter to count from 0 to product(bounds) - val counter = Module(new BasicCounter(32)) + val counter = Module(new BasicCounter(32) { + override val desiredName = s"${module_name_prefix}_AddressGenUnit_Counter" + }) // When start signal is high, the counter is rest to zero. counter.io.reset := io.start @@ -88,7 +93,9 @@ class AddressGenUnit(param: AddressGenUnitParam) inputWidth = io.addr.head.bits.getWidth * param.spatialUnrollingFactor, outputWidth = io.addr.head.bits.getWidth, depth = param.outputBufferDepth - ) + ) { + override val desiredName = s"${module_name_prefix}_AddressBufferFIFO" + } ) // The FSM to record if the AddressGenUnit is busy @@ -151,5 +158,4 @@ class AddressGenUnit(param: AddressGenUnitParam) // Connect io.bufferEmpty signal: If all output is 0, then all addresses are empty, which means io.bufferEmpty should be high io.bufferEmpty := ~(outputBuffer.io.out.map(i => i.valid).reduce(_ | _)) - } diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/DataRequestor.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/DataRequestor.scala index b2a13d597..9b89b1e43 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/DataRequestor.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/DataRequestor.scala @@ -94,15 +94,19 @@ class DataRequestors( tcdmDataWidth: Int, tcdmAddressWidth: Int, isReader: Boolean, - numChannel: Int + numChannel: Int, + module_name_prefix: String = "unnamed_cluster" ) extends Module with RequireAsyncReset { + override val desiredName = s"${module_name_prefix}_DataRequestors" val io = IO( new DataRequestorsIO(tcdmDataWidth, tcdmAddressWidth, isReader, numChannel) ) val DataRequestor = for (i <- 0 until numChannel) yield { val module = Module( - new DataRequestor(tcdmDataWidth, tcdmAddressWidth, isReader) + new DataRequestor(tcdmDataWidth, tcdmAddressWidth, isReader) { + override def desiredName = s"${module_name_prefix}_DataRequestor" + } ) // Address is unconditionally connected diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/DataResponser.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/DataResponser.scala index 700a90bf7..19a4819e2 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/DataResponser.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/DataResponser.scala @@ -46,15 +46,21 @@ class DataResponsersIO(tcdmDataWidth: Int = 64, numChannel: Int = 8) } } -class DataResponsers(tcdmDataWidth: Int = 64, numChannel: Int = 8) - extends Module +class DataResponsers( + tcdmDataWidth: Int = 64, + numChannel: Int = 8, + module_name_prefix: String = "unnamed_cluster" +) extends Module with RequireAsyncReset { val io = IO( new DataResponsersIO(tcdmDataWidth = tcdmDataWidth, numChannel = numChannel) ) + override val desiredName = s"${module_name_prefix}_DataResponsers" // Instantiation and connection val DataResponser = for (i <- 0 until numChannel) yield { - 
val module = Module(new DataResponser(tcdmDataWidth = tcdmDataWidth)) + val module = Module(new DataResponser(tcdmDataWidth = tcdmDataWidth) { + override val desiredName = s"${module_name_prefix}_DataResponser" + }) io.tcdm_rsp(i) <> module.io.tcdm_rsp io.out.data(i) <> module.io.out.data io.out.ResponsorReady(i) := module.io.out.ResponsorReady diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/Reader.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/Reader.scala index 71ff11d45..2201c36aa 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/Reader.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/Reader.scala @@ -9,7 +9,12 @@ import snax.xdma.DesignParams._ // The reader takes the address from the AGU, offer to requestor, and responser collect the data from TCDM and pushed to FIFO packer to recombine into 512 bit data -class Reader(param: ReaderWriterParam) extends Module with RequireAsyncReset { +class Reader(param: ReaderWriterParam, clusterName: String = "unnamed_cluster") + extends Module + with RequireAsyncReset { + + override val desiredName = s"${clusterName}_xdma_Reader" + val io = IO(new Bundle { val cfg = Input(new AddressGenUnitCfgIO(param.agu_param)) val tcdm_req = Vec( @@ -38,7 +43,12 @@ class Reader(param: ReaderWriterParam) extends Module with RequireAsyncReset { }) // Address Generator - val addressgen = Module(new AddressGenUnit(param.agu_param)) + val addressgen = Module( + new AddressGenUnit( + param.agu_param, + module_name_prefix = s"${clusterName}_xdma_Reader" + ) + ) // Requestors to send address to TCDM val requestors = Module( @@ -46,7 +56,8 @@ class Reader(param: ReaderWriterParam) extends Module with RequireAsyncReset { tcdmDataWidth = param.tcdm_param.dataWidth, tcdmAddressWidth = param.tcdm_param.addrWidth, numChannel = param.tcdm_param.numChannel, - isReader = true + isReader = true, + module_name_prefix = s"${clusterName}_xdma_Reader" ) ) @@ -54,7 +65,8 @@ class Reader(param: ReaderWriterParam) extends Module with RequireAsyncReset { val responsers = Module( new DataResponsers( tcdmDataWidth = param.tcdm_param.dataWidth, - numChannel = param.tcdm_param.numChannel + numChannel = param.tcdm_param.numChannel, + module_name_prefix = s"${clusterName}_xdma_Reader" ) ) @@ -64,7 +76,9 @@ class Reader(param: ReaderWriterParam) extends Module with RequireAsyncReset { inputWidth = param.tcdm_param.dataWidth, outputWidth = param.tcdm_param.dataWidth * param.tcdm_param.numChannel, depth = param.bufferDepth - ) + ) { + override val desiredName = s"${clusterName}_xdma_Reader_DataBuffer" + } ) addressgen.io.cfg := io.cfg diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/Writer.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/Writer.scala index b2c8fbd53..08d397d30 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/Writer.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaStreamer/Writer.scala @@ -8,7 +8,12 @@ import snax.utils._ import snax.xdma.CommonCells._ import snax.xdma.DesignParams._ -class Writer(param: ReaderWriterParam) extends Module with RequireAsyncReset { +class Writer(param: ReaderWriterParam, clusterName: String = "unnamed_cluster") + extends Module + with RequireAsyncReset { + + override val desiredName = s"${clusterName}_xdma_Writer" + val io = IO(new Bundle { val cfg = Input(new AddressGenUnitCfgIO(param.agu_param)) val tcdm_req = Vec( @@ -33,7 +38,12 @@ class Writer(param: ReaderWriterParam) extends Module with RequireAsyncReset { val bufferEmpty = Output(Bool()) }) - val addressgen = 
Module(new AddressGenUnit(param.agu_param)) + val addressgen = Module( + new AddressGenUnit( + param.agu_param, + module_name_prefix = s"${clusterName}_xdma_Writer" + ) + ) // Write Requestors // Requestors to send address and data to TCDM val requestors = Module( @@ -41,7 +51,8 @@ class Writer(param: ReaderWriterParam) extends Module with RequireAsyncReset { tcdmDataWidth = param.tcdm_param.dataWidth, tcdmAddressWidth = param.tcdm_param.addrWidth, numChannel = param.tcdm_param.numChannel, - isReader = false + isReader = false, + module_name_prefix = s"${clusterName}_xdma_Writer" ) ) @@ -50,7 +61,9 @@ class Writer(param: ReaderWriterParam) extends Module with RequireAsyncReset { inputWidth = param.tcdm_param.dataWidth * param.tcdm_param.numChannel, outputWidth = param.tcdm_param.dataWidth, depth = param.bufferDepth - ) + ) { + override val desiredName = s"${clusterName}_xdma_Writer_DataBuffer" + } ) addressgen.io.cfg := io.cfg diff --git a/hw/chisel/src/main/scala/snax/xdma/xdmaTop/xdmaTop.scala b/hw/chisel/src/main/scala/snax/xdma/xdmaTop/xdmaTop.scala index 018c3cdb9..abad7743d 100644 --- a/hw/chisel/src/main/scala/snax/xdma/xdmaTop/xdmaTop.scala +++ b/hw/chisel/src/main/scala/snax/xdma/xdmaTop/xdmaTop.scala @@ -7,10 +7,9 @@ import snax.csr_manager._ import snax.utils._ import snax.xdma.xdmaFrontend._ -import snax.xdma.xdmaExtension.HasMemset -import snax.xdma.xdmaExtension.HasTransposer -import snax.xdma.xdmaExtension.HasMaxPool +import snax.xdma.xdmaExtension._ import snax.xdma.DesignParams._ +import os.write class xdmaTopIO( readerparam: DMADataPathParam, @@ -78,9 +77,11 @@ class xdmaTop( readerparam: DMADataPathParam, writerparam: DMADataPathParam, axiWidth: Int = 512, - csrAddrWidth: Int = 32 + csrAddrWidth: Int = 32, + clusterName: String = "unnamed_cluster" ) extends Module with RequireAsyncReset { + override val desiredName = s"${clusterName}_xdma" val io = IO( new xdmaTopIO( readerparam = readerparam, @@ -92,6 +93,7 @@ class xdmaTop( val i_dmactrl = Module( new DMACtrl( + clusterName = clusterName, readerparam = readerparam, writerparam = writerparam, axiWidth = axiWidth, @@ -101,6 +103,7 @@ class xdmaTop( val i_dmadatapath = Module( new DMADataPath( + clusterName = clusterName, readerparam = readerparam, writerparam = writerparam ) @@ -138,15 +141,74 @@ class xdmaTop( } -object xdmaTopTester extends App { +object xdmaTopEmitter extends App { emitVerilog( new xdmaTop( + clusterName = "test_cluster", readerparam = new DMADataPathParam(new ReaderWriterParam, Seq()), writerparam = new DMADataPathParam( new ReaderWriterParam, - Seq(HasMemset, HasTransposer, HasMaxPool) + Seq(HasMaxPool, HasMemset, HasTransposer) ) ), args = Array("--target-dir", "generated") ) } + +object xdmaTopGen extends App { + val parsed_args = snax.utils.ArgParser.parse(args) + + /* + Needed Parameters: + tcdmDataWidth: Int + axiDataWidth: Int + addressWidth: Int + tcdmSize: Int + + readerDimension: Int + writerDimension: Int + readerBufferDepth: Int + writerBufferDepth: Int + HasMemset + HasMaxPool + HasTranspopser + */ + + val readerparam = new ReaderWriterParam( + dimension = parsed_args("readerDimension").toInt, + tcdmDataWidth = parsed_args("tcdmDataWidth").toInt, + tcdmSize = parsed_args("tcdmSize").toInt, + tcdmAddressWidth = parsed_args("addressWidth").toInt, + numChannel = + parsed_args("axiDataWidth").toInt / parsed_args("tcdmDataWidth").toInt, + addressBufferDepth = parsed_args("readerBufferDepth").toInt + ) + + val writerparam = new ReaderWriterParam( + dimension = 
parsed_args("writerDimension").toInt, + tcdmDataWidth = parsed_args("tcdmDataWidth").toInt, + tcdmSize = parsed_args("tcdmSize").toInt, + tcdmAddressWidth = parsed_args("addressWidth").toInt, + numChannel = + parsed_args("axiDataWidth").toInt / parsed_args("tcdmDataWidth").toInt, + addressBufferDepth = parsed_args("writerBufferDepth").toInt + ) + + var extensionparam = Seq[HasDMAExtension]() + if (parsed_args.contains("HasMemset")) + extensionparam = extensionparam :+ HasMemset + if (parsed_args.contains("HasMaxPool")) + extensionparam = extensionparam :+ HasMaxPool + if (parsed_args.contains("HasTransposer")) + extensionparam = extensionparam :+ HasTransposer + + emitVerilog( + new xdmaTop( + clusterName = parsed_args.getOrElse("clusterName", ""), + readerparam = new DMADataPathParam(readerparam, Seq()), + writerparam = new DMADataPathParam(writerparam, extensionparam) + ), + args = + Array("--target-dir", parsed_args.getOrElse("target-dir", "generated")) + ) +} diff --git a/hw/chisel/src/test/scala/snax/xdma/xdmaExtension/DMAExtensionTester.scala b/hw/chisel/src/test/scala/snax/xdma/xdmaExtension/DMAExtensionTester.scala index 13eb13563..93adea74a 100644 --- a/hw/chisel/src/test/scala/snax/xdma/xdmaExtension/DMAExtensionTester.scala +++ b/hw/chisel/src/test/scala/snax/xdma/xdmaExtension/DMAExtensionTester.scala @@ -23,7 +23,7 @@ import scala.util.Random class DMAExtensionHarness(extension: HasDMAExtension) extends Module with RequireAsyncReset { - val dut = extension.instantiate + val dut = extension.instantiate("dma_extension_dut") val io = IO(chiselTypeOf(dut.io)) io.busy_o := dut.io.busy_o diff --git a/hw/chisel_acc/.gitignore b/hw/chisel_acc/.gitignore new file mode 100644 index 000000000..d31f6eff8 --- /dev/null +++ b/hw/chisel_acc/.gitignore @@ -0,0 +1,3 @@ +generated/ +project/ +target/ diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index 0da6524cd..79a878a99 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -341,6 +341,7 @@ for i in range(len(cfg['cores'])): curr_snax_acc_core = 'snax_core_' + str(i) snax_acc_dict = {} snax_acc_flag = False + snax_xdma_flag = False snax_acc_multi_flag = False snax_use_custom_ports = False snax_num_acc = None @@ -392,6 +393,38 @@ for i in range(len(cfg['cores'])): total_snax_narrow_ports += snax_narrow_tcdm_ports total_snax_wide_ports += snax_wide_tcdm_ports + elif ('snax_xdma_cfg' in cfg['cores'][i]): + snax_xdma_flag = True + xdma_cfg = cfg['cores'][i]['snax_xdma_cfg'] + # Note that the order is from last core to the first core + snax_narrow_tcdm_ports_list.append(round(cfg['dma_data_width'] / cfg['data_width']) << 1) + snax_wide_tcdm_ports_list.append(0) + + # Prepare accelerator tags + xdma_instance_name = "xdma" + for key, value in xdma_cfg.items(): + if key.startswith('has_'): + xdma_instance_name += ("_" + key[4:]) + curr_snax_acc = '' + curr_snax_acc = "i_snax_core_" + str(i) + "_" + xdma_instance_name + + # Set tcdm offset ports + snax_narrow_tcdm_ports = round(cfg['dma_data_width'] / cfg['data_width']) * 2 + snax_wide_tcdm_ports = 0 + snax_tcdm_ports = snax_narrow_tcdm_ports + snax_wide_tcdm_ports + tcdm_offset_stop += snax_tcdm_ports + + # Save settings in the dictionary + snax_acc_dict[curr_snax_acc] = { + 'snax_acc_name': xdma_instance_name, + 'snax_tcdm_ports': snax_tcdm_ports, + 'snax_tcdm_offset_start': tcdm_offset_start, + 'snax_tcdm_offset_stop': tcdm_offset_stop + } + 
tcdm_offset_start += snax_tcdm_ports + total_snax_narrow_ports += snax_narrow_tcdm_ports + total_snax_wide_ports += snax_wide_tcdm_ports + else: # Consider cases without accelerators @@ -406,6 +439,7 @@ for i in range(len(cfg['cores'])): # This is the packed configuration snax_core_acc[curr_snax_acc_core] = { 'snax_acc_flag': snax_acc_flag, + 'snax_xdma_flag': snax_xdma_flag, 'snax_acc_multi_flag':snax_acc_multi_flag, 'snax_use_custom_ports': snax_use_custom_ports, 'snax_total_num_csr': snax_total_num_csr, @@ -826,6 +860,55 @@ total_snax_tcdm_ports = total_snax_narrow_ports + total_snax_wide_ports % endfor % endif + + % elif snax_core_acc[idx_key]['snax_xdma_flag']: + % for jdx, jdx_key in enumerate(snax_core_acc[idx_key]['snax_acc_dict']): + // Instantiation of xdma wrapper + ${cfg['name']}_xdma_wrapper # ( + .tcdm_req_t ( ${cfg['pkg_name']}::tcdm_req_t ), + .tcdm_rsp_t ( ${cfg['pkg_name']}::tcdm_rsp_t ) + ) ${jdx_key} ( + //----------------------------- + // Clock and reset + //----------------------------- + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + //----------------------------- + // Cluster Base Address + //----------------------------- + .cluster_base_addr_i( cluster_base_addr_i ), + //----------------------------- + // CSR format control ports + //----------------------------- + // Request + .csr_req_bits_data_i ( snax_csr_req_data [${idx}] ), + .csr_req_bits_addr_i ( snax_csr_req_addr [${idx}] ), + .csr_req_bits_write_i ( snax_csr_req_write[${idx}] ), + .csr_req_valid_i ( snax_csr_req_valid[${idx}] ), + .csr_req_ready_o ( snax_csr_req_ready[${idx}] ), + // Response + .csr_rsp_bits_data_o ( snax_csr_rsp_data [${idx}] ), + .csr_rsp_valid_o ( snax_csr_rsp_valid[${idx}] ), + .csr_rsp_ready_i ( snax_csr_rsp_ready[${idx}] ), + //----------------------------- + // Hardware barrier is not supported by xdma at the moment + //----------------------------- + //----------------------------- + // TCDM ports + //----------------------------- + .tcdm_req_o ( snax_tcdm_req[${snax_core_acc[idx_key]['snax_acc_dict'][jdx_key]['snax_tcdm_offset_stop']}:${snax_core_acc[idx_key]['snax_acc_dict'][jdx_key]['snax_tcdm_offset_start']}] ), + .tcdm_rsp_i ( snax_tcdm_rsp[${snax_core_acc[idx_key]['snax_acc_dict'][jdx_key]['snax_tcdm_offset_stop']}:${snax_core_acc[idx_key]['snax_acc_dict'][jdx_key]['snax_tcdm_offset_start']}] ) + ); + + // Tie unused custom instruction ports to 0 + assign snax_qready [${idx}] = '0; + assign snax_resp [${idx}] = '0; + assign snax_pvalid [${idx}] = '0; + // Tie barrier to 0 + assign snax_barrier [${idx}] = '0; + + % endfor + % else: // If no accelerator is connected to Snitch core diff --git a/hw/templates/snax_xdma_wrapper.sv.tpl b/hw/templates/snax_xdma_wrapper.sv.tpl new file mode 100644 index 000000000..8f592f9b1 --- /dev/null +++ b/hw/templates/snax_xdma_wrapper.sv.tpl @@ -0,0 +1,181 @@ +// Copyright 2024 KU Leuven. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +<% + num_tcdm_ports = 0 + + num_tcdm_ports = round(cfg["dma_data_width"] / cfg["data_width"] * 2) + ## Half of them are used for the reader, and half of them are used for writer + +%> +//----------------------------- +// xdma wrapper +//----------------------------- +module ${cfg["name"]}_xdma_wrapper #( + // TCDM typedefs + parameter type tcdm_req_t = logic, + parameter type tcdm_rsp_t = logic, + // Parameters related to TCDM + parameter int unsigned TCDMDataWidth = ${cfg["data_width"]}, + parameter int unsigned TCDMNumPorts = ${num_tcdm_ports}, + parameter int unsigned PhysicalAddrWidth = ${cfg["addr_width"]} +)( + //----------------------------- + // Clocks and reset + //----------------------------- + input logic clk_i, + input logic rst_ni, + //----------------------------- + // Cluster base address + //----------------------------- + input logic [PhysicalAddrWidth-1:0] cluster_base_addr_i, + //----------------------------- + // TCDM ports + //----------------------------- + output tcdm_req_t [TCDMNumPorts-1:0] tcdm_req_o, + input tcdm_rsp_t [TCDMNumPorts-1:0] tcdm_rsp_i, + //----------------------------- + // CSR control ports + //----------------------------- + // Request + input logic [31:0] csr_req_bits_data_i, + input logic [31:0] csr_req_bits_addr_i, + input logic csr_req_bits_write_i, + input logic csr_req_valid_i, + output logic csr_req_ready_o, + // Response + output logic [31:0] csr_rsp_bits_data_o, + output logic csr_rsp_valid_o, + input logic csr_rsp_ready_i +); + + //----------------------------- + // Wiring and combinational logic + //----------------------------- + + // TCDM signals + // Request + logic [TCDMNumPorts-1:0][PhysicalAddrWidth-1:0] tcdm_req_addr; + logic [TCDMNumPorts-1:0] tcdm_req_write; + //Note that tcdm_req_amo_i is 4 bits based on reqrsp definition + logic [TCDMNumPorts-1:0][ 3:0] tcdm_req_amo; + logic [TCDMNumPorts-1:0][ TCDMDataWidth-1:0] tcdm_req_data; + logic [TCDMNumPorts-1:0][TCDMDataWidth/8-1:0] tcdm_req_strb; + //Note that tcdm_req_user_core_id_i is 5 bits based on Snitch definition + logic [TCDMNumPorts-1:0][ 4:0] tcdm_req_user_core_id; + logic [TCDMNumPorts-1:0] tcdm_req_user_is_core; + logic [TCDMNumPorts-1:0] tcdm_req_q_valid; + + // Response + logic [TCDMNumPorts-1:0] tcdm_rsp_q_ready; + logic [TCDMNumPorts-1:0] tcdm_rsp_p_valid; + logic [TCDMNumPorts-1:0][ TCDMDataWidth-1:0] tcdm_rsp_data; + + // Fixed ports that are defaulted to tie-low + // towards the TCDM from the streamer + always_comb begin + for(int i = 0; i < TCDMNumPorts; i++ ) begin + tcdm_req_amo [i] = '0; + tcdm_req_user_core_id [i] = '0; + tcdm_req_user_is_core [i] = '0; + tcdm_req_strb [i] = '1; + // Be aware that if we need the bit-level granularity, this part needs to be modified + end + end + + // Re-mapping wires for TCDM IO ports + always_comb begin + for ( int i = 0; i < TCDMNumPorts; i++) begin + tcdm_req_o[i].q.addr = tcdm_req_addr [i]; + tcdm_req_o[i].q.write = tcdm_req_write [i]; + tcdm_req_o[i].q.amo = reqrsp_pkg::AMONone; + tcdm_req_o[i].q.data = tcdm_req_data [i]; + tcdm_req_o[i].q.strb = '1; + tcdm_req_o[i].q.user.core_id = '0; + tcdm_req_o[i].q.user.is_core = '0; + tcdm_req_o[i].q_valid = tcdm_req_q_valid[i]; + + tcdm_rsp_q_ready[i] = tcdm_rsp_i[i].q_ready; + tcdm_rsp_p_valid[i] = tcdm_rsp_i[i].p_valid; + tcdm_rsp_data [i] = tcdm_rsp_i[i].p.data ; + end + end + + // Streamer module that is generated + // with template mechanics + ${cfg["name"]}_xdma i_${cfg["name"]}_xdma ( + //----------------------------- + // Clocks 
and reset + //----------------------------- + .clock ( clk_i ), + .reset ( ~rst_ni ), + + //----------------------------- + // Cluster base address + //----------------------------- + .io_clusterBaseAddress(cluster_base_addr_i), + //----------------------------- + // TCDM Ports + //----------------------------- + // Reader's Request + // Ready signal is very strange... ETH defines ready at rsp side, but we think it should at request-side (imagine system with outstanding request support) + +% for idx in range(0, num_tcdm_ports >> 1): + .io_tcdm_reader_req_${idx}_ready ( tcdm_rsp_q_ready[${idx}] ), + .io_tcdm_reader_req_${idx}_valid ( tcdm_req_q_valid[${idx}] ), + .io_tcdm_reader_req_${idx}_bits_addr ( tcdm_req_addr [${idx}] ), + .io_tcdm_reader_req_${idx}_bits_write ( tcdm_req_write [${idx}] ), + .io_tcdm_reader_req_${idx}_bits_data ( tcdm_req_data [${idx}] ), +% endfor + // Writer's Request +% for idx in range(0, num_tcdm_ports >> 1): + .io_tcdm_writer_req_${idx}_ready ( tcdm_rsp_q_ready[${idx + (num_tcdm_ports >> 1)}] ), + .io_tcdm_writer_req_${idx}_valid ( tcdm_req_q_valid[${idx + (num_tcdm_ports >> 1)}] ), + .io_tcdm_writer_req_${idx}_bits_addr ( tcdm_req_addr [${idx + (num_tcdm_ports >> 1)}] ), + .io_tcdm_writer_req_${idx}_bits_write ( tcdm_req_write [${idx + (num_tcdm_ports >> 1)}] ), + .io_tcdm_writer_req_${idx}_bits_data ( tcdm_req_data [${idx + (num_tcdm_ports >> 1)}] ), +% endfor + // Reader's Respose +% for idx in range(num_tcdm_ports >> 1): + .io_tcdm_reader_rsp_${idx}_valid ( tcdm_rsp_p_valid[${idx}] ), + .io_tcdm_reader_rsp_${idx}_bits_data( tcdm_rsp_data [${idx}] ), +% endfor + // Writer has no Respose + //----------------------------- + // CSR control ports + //----------------------------- + // Request + .io_csrIO_req_bits_data ( csr_req_bits_data_i ), + .io_csrIO_req_bits_addr ( csr_req_bits_addr_i ), + .io_csrIO_req_bits_write ( csr_req_bits_write_i ), + .io_csrIO_req_valid ( csr_req_valid_i ), + .io_csrIO_req_ready ( csr_req_ready_o ), + + // Response + .io_csrIO_rsp_bits_data ( csr_rsp_bits_data_o ), + .io_csrIO_rsp_valid ( csr_rsp_valid_o ), + .io_csrIO_rsp_ready ( csr_rsp_ready_i ), + //----------------------------- + // Tie-off unused AXI port + //----------------------------- + // Remote data + .io_remoteDMADataPath_fromRemote_valid ('0), + .io_remoteDMADataPath_fromRemote_ready ( ), + .io_remoteDMADataPath_fromRemote_bits ( ), + + .io_remoteDMADataPath_toRemote_ready ('0), + .io_remoteDMADataPath_toRemote_valid ( ), + .io_remoteDMADataPath_toRemote_bits ( ), + + // Remote cfg + .io_remoteDMADataPathCfg_fromRemote_valid ('0), + .io_remoteDMADataPathCfg_fromRemote_ready ( ), + .io_remoteDMADataPathCfg_fromRemote_bits ( ), + + .io_remoteDMADataPathCfg_toRemote_ready ('0), + .io_remoteDMADataPathCfg_toRemote_valid ( ), + .io_remoteDMADataPathCfg_toRemote_bits ( ) + ); + +endmodule diff --git a/target/snitch_cluster/Makefile b/target/snitch_cluster/Makefile index 704e63907..eb6c70a3a 100644 --- a/target/snitch_cluster/Makefile +++ b/target/snitch_cluster/Makefile @@ -190,6 +190,27 @@ $(eval $(call generate_snax_gen,snax_streamer_gemmX)) endif +ifeq (${CFG_OVERRIDE}, cfg/snax-streamer-gemmX-xdma.hjson) + + CLUSTER_NAME = snax_streamer_gemmX_xdma_cluster + +$(eval $(call generate_snax_gen,snax_streamer_gemmX)) + + SNAX_GEMMX_ROOT ?= $(ROOT)/hw/chisel_acc + include $(SNAX_GEMMX_ROOT)/Makefile + +# This is a temporary solution before integrating xdma into snitch_cluster.sv + + SNAX_GEN += \ + $(GENERATED_DIR)/$(CLUSTER_NAME)_xdma/$(CLUSTER_NAME)_xdma.sv \ + 
$(GENERATED_DIR)/$(CLUSTER_NAME)_xdma/$(CLUSTER_NAME)_xdma_wrapper.sv + + VSIM_BENDER += -t snax_streamer_gemmX_xdma + VLT_BENDER += -t snax_streamer_gemmX_xdma + VCS_BENDER += -t snax_streamer_gemmX_xdma + +endif + ifeq (${CFG_OVERRIDE}, cfg/snax-wide-gemm-data-reshuffler.hjson) CLUSTER_NAME = snax_streamer_gemm_cluster diff --git a/target/snitch_cluster/cfg/snax-streamer-gemmX-xdma.hjson b/target/snitch_cluster/cfg/snax-streamer-gemmX-xdma.hjson new file mode 100644 index 000000000..b49ff6111 --- /dev/null +++ b/target/snitch_cluster/cfg/snax-streamer-gemmX-xdma.hjson @@ -0,0 +1,206 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Cluster configuration for a simple testbench system. +{ + nr_s1_quadrant: 1, + s1_quadrant: { + nr_clusters: 1, + }, + + cluster: { + name: "snax_streamer_gemmX_xdma_cluster", + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x1000_0000 + cluster_base_offset: 0, // 0x0 + cluster_base_hartid: 0, + addr_width: 48, + data_width: 64, + tcdm: { + size: 128, + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + dma_data_width: 512, + dma_axi_req_fifo_depth: 3, + dma_req_fifo_depth: 3, + // Timing parameters + timing: { + lat_comp_fp32: 3, + lat_comp_fp64: 3, + lat_comp_fp16: 2, + lat_comp_fp16_alt: 2, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 1, + lat_sdotp: 2, + fpu_pipe_config: "BEFORE" + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. + register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/snax_streamer_gemmX_core_template" }, + { $ref: "#/snax_xdma_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ] + }, + dram: { + // 0x8000_0000 + address: 2147483648, + // 0x8000_0000 + length: 2147483648 + }, + peripherals: { + clint: { + // 0xffff_0000 + address: 4294901760, + // 0x0000_1000 + length: 4096 + }, + }, + // Templates. + snax_streamer_gemmX_core_template: { + isa: "rv32ima", + xssr: false, + xfrep: false, + xdma: false, + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + snax_acc_cfg: { + snax_acc_name: "snax_streamer_gemmX", + // add a checker here? + // some of the tcdm ports specificed here? 
+ snax_wide_tcdm_ports: 56, + snax_num_rw_csr: 10, + snax_num_ro_csr: 2, + snax_streamer_cfg: {$ref: "#/snax_streamer_gemmX_streamer_template" } + }, + snax_use_custom_ports: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // Enable division/square root unit + // Xdiv_sqrt: true, + }, + dma_core_template: { + isa: "rv32ima", + // Xdiv_sqrt: true, + # isa: "rv32ema", + xdma: true + xssr: false + xfrep: false + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + }, + snax_xdma_core_template: { + isa: "rv32ima", + snax_xdma_cfg: { + reader_buffer: 8, + writer_buffer: 8, + reader_agu_dimension: 3, + writer_agu_dimension: 3, + has_transposer: true, + has_maxpool: true, + has_memset: true + } + xdma: false + xssr: false + xfrep: false + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + }, + // SNAX Streamer Templates + snax_streamer_gemmX_streamer_template :{ + + temporal_addrgen_unit_params: { + loop_dim: [6, 3, 2, 2, 2], + share_temp_addr_gen_loop_bounds: false, + } + + fifo_reader_params: { + fifo_width: [512, 512], + fifo_depth: [2, 2], + } + + fifo_writer_params: { + fifo_width: [512], + fifo_depth: [2], + } + + fifo_reader_writer_params: { + fifo_width: [2048], + fifo_depth: [2], + } + + data_reader_params:{ + tcdm_ports_num: [8, 8], + spatial_bounds: [[8, 8], [8, 8]], + spatial_dim: [2, 2], + element_width: [8, 8], + } + + data_writer_params:{ + tcdm_ports_num: [8], + spatial_bounds: [[8, 8]], + spatial_dim: [2], + element_width: [8], + } + + data_reader_writer_params:{ + tcdm_ports_num: [32], + spatial_bounds: [[8, 8]], + spatial_dim: [2], + element_width: [32], + } + + stationarity: [0,0,0,0,0] + } +} diff --git a/target/snitch_cluster/sw/Makefile b/target/snitch_cluster/sw/Makefile index bcc0e80a1..dcc97e37d 100644 --- a/target/snitch_cluster/sw/Makefile +++ b/target/snitch_cluster/sw/Makefile @@ -15,19 +15,19 @@ else RUNTIME = runtime/rtl endif -SUBDIRS += math $(RUNTIME) snax/mac snax/gemm snax/streamer-gemm snax/streamer-simd snax/data-reshuffler snax/streamer-gemm-conv snax/streamer-gemm-conv-simd apps tests +SUBDIRS += math $(RUNTIME) snax/mac snax/gemm snax/streamer-gemm snax/streamer-simd snax/data-reshuffler snax/streamer-gemm-conv snax/streamer-gemm-conv-simd snax/xdma apps tests .PHONY: all $(SUBDIRS) all: $(SUBDIRS) # Explicit dependency of apps on runtime -apps: math $(RUNTIME) snax/mac snax/gemm snax/streamer-gemm snax/streamer-simd snax/data-reshuffler snax/streamer-gemm-conv snax/streamer-gemm-conv-simd +apps: math $(RUNTIME) snax/mac snax/gemm snax/streamer-gemm snax/streamer-simd snax/data-reshuffler snax/streamer-gemm-conv snax/streamer-gemm-conv-simd snax/xdma $(MAKE) -C $@ TARGET=$(TARGET) # Explicit dependency of tests on runtime -tests: math $(RUNTIME) snax/mac snax/gemm snax/streamer-gemm snax/streamer-simd snax/data-reshuffler snax/streamer-gemm-conv snax/streamer-gemm-conv-simd +tests: math $(RUNTIME) snax/mac 
snax/gemm snax/streamer-gemm snax/streamer-simd snax/data-reshuffler snax/streamer-gemm-conv snax/streamer-gemm-conv-simd snax/xdma $(MAKE) -C $@ $(TARGET) -runtime/rtl runtime/banshee runtime/rtl-generic math snax/mac snax/gemm snax/streamer-gemm snax/streamer-simd snax/data-reshuffler snax/streamer-gemm-conv snax/streamer-gemm-conv-simd : +runtime/rtl runtime/banshee runtime/rtl-generic math snax/mac snax/gemm snax/streamer-gemm snax/streamer-simd snax/data-reshuffler snax/streamer-gemm-conv snax/streamer-gemm-conv-simd snax/xdma: $(MAKE) -C $@ $(TARGET) diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile index 409ee38d7..30f14c93c 100644 --- a/target/snitch_cluster/sw/apps/Makefile +++ b/target/snitch_cluster/sw/apps/Makefile @@ -34,6 +34,7 @@ SUBDIRS += snax-alu SUBDIRS += snax-data-reshuffler SUBDIRS += snax-streamer-gemm-conv-simd SUBDIRS += snax-streamer-gemm-conv +SUBDIRS += snax-xdma-memset .PHONY: all clean $(SUBDIRS) diff --git a/target/snitch_cluster/sw/apps/snax-xdma-memset/Makefile b/target/snitch_cluster/sw/apps/snax-xdma-memset/Makefile new file mode 100644 index 000000000..a076594c4 --- /dev/null +++ b/target/snitch_cluster/sw/apps/snax-xdma-memset/Makefile @@ -0,0 +1,14 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Yunhao Deng + +APP = snax-xdma-memset +SRCS = src/snax-xdma-memset.c +INCDIRS += ../../snax/xdma/include +INCDIRS += data + +RISCV_LDFLAGS += ../../snax/xdma/build/snax-xdma-lib.o + +include ../common.mk diff --git a/target/snitch_cluster/sw/apps/snax-xdma-memset/data/data.h b/target/snitch_cluster/sw/apps/snax-xdma-memset/data/data.h new file mode 100644 index 000000000..856d0eb8d --- /dev/null +++ b/target/snitch_cluster/sw/apps/snax-xdma-memset/data/data.h @@ -0,0 +1,7 @@ +// Copyright 2024 KU Leuven. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Yunhao Deng + +#include diff --git a/target/snitch_cluster/sw/apps/snax-xdma-memset/src/snax-xdma-memset.c b/target/snitch_cluster/sw/apps/snax-xdma-memset/src/snax-xdma-memset.c new file mode 100644 index 000000000..2b93e9309 --- /dev/null +++ b/target/snitch_cluster/sw/apps/snax-xdma-memset/src/snax-xdma-memset.c @@ -0,0 +1,134 @@ +// Copyright 2024 KU Leuven. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Yunhao Deng + +#include "data.h" +#include "snax-xdma-lib.h" +#include "snrt.h" + +int main() { + // Set err value for checking + int err = 0; + + // Obtain the start address of the TCDM memory + uint8_t *tcdm_baseaddress = (uint8_t *)snrt_l1_next(); + uint8_t *tcdm_0 = tcdm_baseaddress; + uint8_t *tcdm_16 = tcdm_baseaddress + 0x4000 * sizeof(uint8_t); + uint8_t *tcdm_32 = tcdm_baseaddress + 0x8000 * sizeof(uint8_t); + uint8_t *tcdm_48 = tcdm_baseaddress + 0xc000 * sizeof(uint8_t); + uint8_t *tcdm_64 = tcdm_baseaddress + 0x10000 * sizeof(uint8_t); + uint8_t *tcdm_80 = tcdm_baseaddress + 0x14000 * sizeof(uint8_t); + uint8_t *tcdm_96 = tcdm_baseaddress + 0x18000 * sizeof(uint8_t); + uint8_t *tcdm_112 = tcdm_baseaddress + 0x1c000 * sizeof(uint8_t); + + // Using xdma core only + if (snrt_cluster_core_idx() == snrt_cluster_compute_core_num() - 1) { + // The xdma core is the last compute core in the cluster + if (xdma_memcpy_1d(tcdm_0, tcdm_0, 0x4000 * sizeof(uint8_t)) != 0) { + printf("Error in xdma agu configuration\n"); + err++; + } else { + printf("The xdma agu is configured\n"); + } + + uint32_t ext_param[1] = {0xFFFFFFFF}; + if (xdma_enable_dst_ext(0, ext_param) != 0) { + printf("Error in enabling xdma extension 0\n"); + err++; + } else { + printf("The xdma extension 0 is enabled\n"); + } + + if (xdma_disable_dst_ext(1) != 0) { + printf("Error in disabling xdma extension 1\n"); + err++; + } else { + printf("The xdma extension 1 is disabled\n"); + } + + if (xdma_disable_dst_ext(2) != 0) { + printf("Error in disabling xdma extension 2\n"); + err++; + } else { + printf("The xdma extension 2 is disabled\n"); + } + + if (err != 0) { + return err; + } + + int task_id = xdma_start(); + printf( + "The xdma is started, setting memory region to 0xFF. The task id " + "is %d\n", + task_id); + xdma_wait(task_id); + + printf("The xdma is finished\n"); + // Check the data + for (int i = 0; i < 0x4000; i++) { + if (tcdm_0[i] != 0xFF) { + printf("The memset of 0KB - 16KB is not correct\n"); + return -1; + } + } + printf("The memset of 0KB - 16KB is correct\n"); + + // Setting the 4K-12K region back to 0 + if (xdma_memcpy_1d(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t), + 0x2000 * sizeof(uint8_t)) != 0) { + printf("Error in xdma agu configuration\n"); + err++; + } else { + printf("The xdma agu is configured\n"); + } + + ext_param[0] = 0; + if (xdma_enable_dst_ext(0, ext_param) != 0) { + printf("Error in enabling xdma extension 0\n"); + err++; + } else { + printf("The xdma extension 0 is enabled\n"); + } + + if (err != 0) { + return err; + } + + task_id = xdma_start(); + printf( + "The xdma is started, setting memory region to 0x00. The task id " + "is %d\n", + task_id); + xdma_wait(task_id); + + printf("The xdma is finished\n"); + // Check the data + for (int i = 0; i < 0x1000; i++) { + if (tcdm_0[i] != 0xFF) { + printf("Error in memset (region 0)\n"); + return -1; + } + } + for (int i = 0x1000; i < 0x3000; i++) { + if (tcdm_0[i] != 0x00) { + printf("The memset is incorrect (region 1)\n"); + return -1; + } + } + for (int i = 0x3000; i < 0x4000; i++) { + if (tcdm_0[i] != 0xFF) { + printf("The memset is incorrect (region 2)\n"); + return -1; + } + } + printf("The memset of 4KB - 12KB is correct\n"); + + } else { + printf("Core %d is not xdma core. 
\n", snrt_cluster_core_idx()); + } + + return 0; +} diff --git a/target/snitch_cluster/sw/snax-xdma-run.yaml b/target/snitch_cluster/sw/snax-xdma-run.yaml new file mode 100644 index 000000000..5e96f3bd2 --- /dev/null +++ b/target/snitch_cluster/sw/snax-xdma-run.yaml @@ -0,0 +1,9 @@ +# Copyright 2024 KU Leuven. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Yunhao Deng + + +runs: + - elf: apps/snax-xdma-memset/build/snax-xdma-memset.elf diff --git a/target/snitch_cluster/sw/snax/xdma/Makefile b/target/snitch_cluster/sw/snax/xdma/Makefile new file mode 100644 index 000000000..616c69a0b --- /dev/null +++ b/target/snitch_cluster/sw/snax/xdma/Makefile @@ -0,0 +1,78 @@ +# Copyright 2023 KU Leuven. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Xiaoling Yi + +# Usage of absolute paths is required to externally include +# this Makefile from multiple different locations + +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) +include $(MK_DIR)/../../toolchain.mk + +# Banshee runtime is not supported +ifeq ($(SELECT_RUNTIME), rtl-generic) +RUNTIME_DIR := rtl-generic +else +RUNTIME_DIR := rtl +endif + +################ +## Directories # +################ + +# Fixed paths in repository tree +ROOT = $(abspath $(MK_DIR)/../../../../..) +SNRT_DIR = $(ROOT)/sw/snRuntime +## Paths relative to the runtime including this Makefile +BUILDDIR = $(abspath build) +SRC_DIR = $(abspath src) + +#################### +## Build variables # +#################### + +INCDIRS += $(abspath include) +INCDIRS += $(SNRT_DIR)/src +INCDIRS += $(SNRT_DIR)/api +INCDIRS += $(SNRT_DIR)/src/omp +INCDIRS += $(SNRT_DIR)/api/omp +INCDIRS += $(SNRT_DIR)/vendor/riscv-opcodes +INCDIRS += $(ROOT)/target/snitch_cluster/sw/runtime/common + +# math.h needed by snRuntime + +INCDIRS += $(SNRT_DIR)/../math/arch/riscv64/ +INCDIRS += $(SNRT_DIR)/../math/arch/generic +INCDIRS += $(SNRT_DIR)/../math/src/include +INCDIRS += $(SNRT_DIR)/../math/src/internal +INCDIRS += $(SNRT_DIR)/../math/include/bits +INCDIRS += $(SNRT_DIR)/../math/include + +INCDIRS += $(ROOT)/target/snitch_cluster/sw/runtime/$(RUNTIME_DIR)/src +INCDIRS += $(ROOT)/target/snitch_cluster/sw/runtime/$(RUNTIME_DIR)/include + +############ +## Outputs # +############ + +OBJS = $(BUILDDIR)/snax-xdma-lib.o +ALL_OUTPUTS = $(OBJS) + + +########## +## Rules # +########## + +.PHONY: all +all: $(ALL_OUTPUTS) + +.PHONY: clean +clean: + rm -rf $(BUILDDIR) + +$(BUILDDIR): + mkdir -p $@ + +$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ diff --git a/target/snitch_cluster/sw/snax/xdma/include/snax-xdma-csr-addr.h b/target/snitch_cluster/sw/snax/xdma/include/snax-xdma-csr-addr.h new file mode 100644 index 000000000..be9caaa81 --- /dev/null +++ b/target/snitch_cluster/sw/snax/xdma/include/snax-xdma-csr-addr.h @@ -0,0 +1,32 @@ +// Copyright 2024 KU Leuven. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Yunhao Deng + +// This file is (will be) generated by scala, do not modify it manually + +#define XDMA_BASE_ADDR 960 +#define XDMA_SRC_ADDR_PTR_LSB XDMA_BASE_ADDR +#define XDMA_SRC_ADDR_PTR_MSB XDMA_SRC_ADDR_PTR_LSB + 1 +#define XDMA_SRC_DIM 3 +#define XDMA_SRC_BOUND_PTR XDMA_SRC_ADDR_PTR_MSB + 1 +#define XDMA_SRC_STRIDE_PTR XDMA_SRC_BOUND_PTR + XDMA_SRC_DIM +#define XDMA_SRC_EXT_CSR_PTR XDMA_SRC_STRIDE_PTR + XDMA_SRC_DIM +#define XDMA_SRC_EXT_NUM 0 +#define XDMA_SRC_EXT_CSR_NUM 0 +#define XDMA_SRC_EXT_CUSTOM_CSR_NUM \ + {} +#define XDMA_DST_ADDR_PTR_LSB XDMA_SRC_EXT_CSR_PTR + XDMA_SRC_EXT_CSR_NUM +#define XDMA_DST_ADDR_PTR_MSB XDMA_DST_ADDR_PTR_LSB + 1 +#define XDMA_DST_DIM 3 +#define XDMA_DST_BOUND_PTR XDMA_DST_ADDR_PTR_MSB + 1 +#define XDMA_DST_STRIDE_PTR XDMA_DST_BOUND_PTR + XDMA_DST_DIM +#define XDMA_DST_EXT_CSR_PTR XDMA_DST_STRIDE_PTR + XDMA_DST_DIM +#define XDMA_DST_EXT_NUM 3 +#define XDMA_DST_EXT_CSR_NUM 5 +#define XDMA_DST_EXT_CUSTOM_CSR_NUM \ + { 1, 1, 0 } +#define XDMA_START_PTR XDMA_DST_EXT_CSR_PTR + XDMA_DST_EXT_CSR_NUM +#define XDMA_COMMIT_TASK_PTR XDMA_START_PTR + 1 +#define XDMA_FINISH_TASK_PTR XDMA_COMMIT_TASK_PTR + 1 diff --git a/target/snitch_cluster/sw/snax/xdma/include/snax-xdma-lib.h b/target/snitch_cluster/sw/snax/xdma/include/snax-xdma-lib.h new file mode 100644 index 000000000..98c440b07 --- /dev/null +++ b/target/snitch_cluster/sw/snax/xdma/include/snax-xdma-lib.h @@ -0,0 +1,34 @@ +// Copyright 2024 KU Leuven. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Yunhao Deng + +#include +#include "snrt.h" +#include "stdint.h" + +#pragma once + +// Define the CSR address of xdma, should be generated by scala +#include "snax-xdma-csr-addr.h" + +// Set CSR for xdma +int32_t xdma_memcpy_nd(uint8_t *src, uint8_t *dst, uint32_t unit_size_src, + uint32_t unit_size_dst, uint32_t dim_src, + uint32_t dim_dst, uint32_t *stride_src, + uint32_t *stride_dst, uint32_t *bound_src, + uint32_t *bound_dst); +int32_t xdma_memcpy_1d(uint8_t *src, uint8_t *dst, uint32_t size); +int32_t xdma_enable_src_ext(uint8_t ext, uint32_t *csr_value); +int32_t xdma_disable_src_ext(uint8_t ext); +int32_t xdma_enable_dst_ext(uint8_t ext, uint32_t *csr_value); +int32_t xdma_disable_dst_ext(uint8_t ext); + +// Start xdma +uint32_t xdma_start(); + +// Check if xdma is finished +bool xdma_is_finished(uint32_t task_id); + +void xdma_wait(uint32_t task_id); diff --git a/target/snitch_cluster/sw/snax/xdma/src/snax-xdma-lib.c b/target/snitch_cluster/sw/snax/xdma/src/snax-xdma-lib.c new file mode 100644 index 000000000..71620e50b --- /dev/null +++ b/target/snitch_cluster/sw/snax/xdma/src/snax-xdma-lib.c @@ -0,0 +1,377 @@ +// Copyright 2024 KU Leuven. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Yunhao Deng + +#include "snax-xdma-lib.h" +#include +#include "snrt.h" +#include "stdint.h" + +#define XDMA_DEBUG +#ifdef XDMA_DEBUG +#define XDMA_DEBUG_PRINT(...) printf(__VA_ARGS__) +#else +#define XDMA_DEBUG_PRINT(...) 
+#endif + +// Soft switch for CSR to make it support dynamic addressing +// The function can address 32 CSR registers starting from 960 + +uint32_t read_csr_soft_switch(uint32_t csr_address) { + XDMA_DEBUG_PRINT("Data from CSR location %d will be provided\n", + csr_address); + uint32_t value; + switch (csr_address) { + case 960: + return read_csr(960); + case 961: + return read_csr(961); + case 962: + return read_csr(962); + case 963: + return read_csr(963); + case 964: + return read_csr(964); + case 965: + return read_csr(965); + case 966: + return read_csr(966); + case 967: + return read_csr(967); + case 968: + return read_csr(968); + case 969: + return read_csr(969); + case 970: + return read_csr(970); + break; + case 971: + return read_csr(971); + case 972: + return read_csr(972); + case 973: + return read_csr(973); + case 974: + return read_csr(974); + break; + case 975: + return read_csr(975); + case 976: + return read_csr(976); + case 977: + return read_csr(977); + case 978: + return read_csr(978); + case 979: + return read_csr(979); + case 980: + return read_csr(980); + case 981: + return read_csr(981); + case 982: + return read_csr(982); + case 983: + return read_csr(983); + case 984: + return read_csr(984); + case 985: + return read_csr(985); + case 986: + return read_csr(986); + case 987: + return read_csr(987); + break; + case 988: + return read_csr(988); + case 989: + return read_csr(989); + case 990: + return read_csr(990); + case 991: + return read_csr(991); + } + return 0; +} + +void write_csr_soft_switch(uint32_t csr_address, uint32_t value) { + XDMA_DEBUG_PRINT("%x is written at CSR location %d\n", value, csr_address); + switch (csr_address) { + case 960: + write_csr(960, value); + break; + case 961: + write_csr(961, value); + break; + case 962: + write_csr(962, value); + break; + case 963: + write_csr(963, value); + break; + case 964: + write_csr(964, value); + break; + case 965: + write_csr(965, value); + break; + case 966: + write_csr(966, value); + break; + case 967: + write_csr(967, value); + break; + case 968: + write_csr(968, value); + break; + case 969: + write_csr(969, value); + break; + case 970: + write_csr(970, value); + break; + case 971: + write_csr(971, value); + break; + case 972: + write_csr(972, value); + break; + case 973: + write_csr(973, value); + break; + case 974: + write_csr(974, value); + break; + case 975: + write_csr(975, value); + break; + case 976: + write_csr(976, value); + break; + case 977: + write_csr(977, value); + break; + case 978: + write_csr(978, value); + break; + case 979: + write_csr(979, value); + break; + case 980: + write_csr(980, value); + break; + case 981: + write_csr(981, value); + break; + case 982: + write_csr(982, value); + break; + case 983: + write_csr(983, value); + break; + case 984: + write_csr(984, value); + break; + case 985: + write_csr(985, value); + break; + case 986: + write_csr(986, value); + break; + case 987: + write_csr(987, value); + break; + case 988: + write_csr(988, value); + break; + case 989: + write_csr(989, value); + break; + case 990: + write_csr(990, value); + break; + case 991: + write_csr(991, value); + break; + } +} + +int32_t xdma_memcpy_nd(uint8_t* src, uint8_t* dst, uint32_t unit_size_src, + uint32_t unit_size_dst, uint32_t dim_src, + uint32_t dim_dst, uint32_t* stride_src, + uint32_t* stride_dst, uint32_t* bound_src, + uint32_t* bound_dst) { + write_csr_soft_switch(XDMA_SRC_ADDR_PTR_LSB, (uint32_t)(uint64_t)src); + write_csr_soft_switch(XDMA_SRC_ADDR_PTR_MSB, + (uint32_t)((uint64_t)src >> 
32)); + + write_csr_soft_switch(XDMA_DST_ADDR_PTR_LSB, (uint32_t)(uint64_t)dst); + write_csr_soft_switch(XDMA_DST_ADDR_PTR_MSB, + (uint32_t)((uint64_t)dst >> 32)); + // Rule check + // unit size only support 8 bytes or n * 64 bytes + XDMA_DEBUG_PRINT("unit size src: %d\n", unit_size_src); + XDMA_DEBUG_PRINT("unit size dst: %d\n", unit_size_dst); + + if ((unit_size_src % 64 != 0) && (unit_size_src != 8)) { + XDMA_DEBUG_PRINT("unit size src error\n"); + return -1; + } + if ((unit_size_dst % 64 != 0) && (unit_size_dst != 8)) { + XDMA_DEBUG_PRINT("unit size dst error\n"); + return -2; + } + // Src size and dst size should be equal + uint32_t src_size = unit_size_src; + for (uint32_t i = 0; i < dim_src - 1; i++) { + src_size *= bound_src[i]; + } + uint32_t dst_size = unit_size_dst; + for (uint32_t i = 0; i < dim_dst - 1; i++) { + dst_size *= bound_dst[i]; + } + if (src_size != dst_size) { + XDMA_DEBUG_PRINT("src size and dst size not equal\n"); + return -3; + } + + // Dimension 1 at src + uint32_t i = 0; + if (unit_size_src % 64 == 0) { + write_csr_soft_switch(XDMA_SRC_STRIDE_PTR + i, 8); + write_csr_soft_switch(XDMA_SRC_BOUND_PTR + i, unit_size_src >> 3); + i++; + } + // Dimension 2 to n at src + for (uint32_t j = 0; j < dim_src - 1; j++) { + if (i + j >= XDMA_SRC_DIM) { + XDMA_DEBUG_PRINT("Source dimension is too high for xdma\n"); + return -4; + } + write_csr_soft_switch(XDMA_SRC_BOUND_PTR + i + j, bound_src[j]); + write_csr_soft_switch(XDMA_SRC_STRIDE_PTR + i + j, stride_src[j]); + } + // Dimension n to MAX at src + for (uint32_t j = dim_src - 1; (i + j) < XDMA_SRC_DIM; j++) { + write_csr_soft_switch(XDMA_SRC_BOUND_PTR + i + j, 1); + write_csr_soft_switch(XDMA_SRC_STRIDE_PTR + i + j, 0); + } + + // Dimension 1 at dst + i = 0; + if (unit_size_dst % 64 == 0) { + write_csr_soft_switch(XDMA_DST_STRIDE_PTR + i, 8); + write_csr_soft_switch(XDMA_DST_BOUND_PTR + i, unit_size_dst >> 3); + i++; + } + // Dimension 2 to n at dst + for (uint32_t j = 0; j < dim_dst - 1; j++) { + if (i + j >= XDMA_DST_DIM) { + XDMA_DEBUG_PRINT("Destination dimension is too high for xdma\n"); + return -5; + } + write_csr_soft_switch(XDMA_DST_BOUND_PTR + i + j, bound_dst[j]); + write_csr_soft_switch(XDMA_DST_STRIDE_PTR + i + j, stride_dst[j]); + } + // Dimension n to MAX at dst + for (uint32_t j = dim_dst - 1; (i + j) < XDMA_DST_DIM; j++) { + write_csr_soft_switch(XDMA_DST_BOUND_PTR + i + j, 1); + write_csr_soft_switch(XDMA_DST_STRIDE_PTR + i + j, 0); + } + return 0; +} + +int32_t xdma_memcpy_1d(uint8_t* src, uint8_t* dst, uint32_t size) { + return xdma_memcpy_nd(src, dst, size, size, 1, 1, (uint32_t*)NULL, + (uint32_t*)NULL, (uint32_t*)NULL, (uint32_t*)NULL); +} + +// xdma extension interface +int32_t xdma_enable_src_ext(uint8_t ext, uint32_t* csr_value) { + if (ext >= XDMA_SRC_EXT_NUM) { + return -1; + } + uint8_t custom_csr_list[XDMA_SRC_EXT_NUM] = XDMA_SRC_EXT_CUSTOM_CSR_NUM; + uint32_t csr_offset = XDMA_SRC_EXT_CSR_PTR; + for (uint8_t i = 0; i < ext; i++) { + csr_offset += custom_csr_list[i] + 1; + } + + // Not bypass the xdma extension -> set the first CSR to 0 + write_csr_soft_switch(csr_offset, 0); + csr_offset++; + for (uint8_t i = 0; i < custom_csr_list[ext]; i++) { + write_csr_soft_switch(csr_offset + i, csr_value[i]); + } + return 0; +} +int32_t xdma_enable_dst_ext(uint8_t ext, uint32_t* csr_value) { + if (ext >= XDMA_DST_EXT_NUM) { + return -1; + } + uint8_t custom_csr_list[XDMA_DST_EXT_NUM] = XDMA_DST_EXT_CUSTOM_CSR_NUM; + uint32_t csr_offset = XDMA_DST_EXT_CSR_PTR; + for (uint8_t i = 0; i < ext; i++) { + 
csr_offset += custom_csr_list[i] + 1; + } + + // Not bypass the xdma extension -> set the first CSR to 0 + write_csr_soft_switch(csr_offset, 0); + csr_offset++; + for (uint8_t i = 0; i < custom_csr_list[ext]; i++) { + write_csr_soft_switch(csr_offset + i, csr_value[i]); + } + return 0; +} + +int32_t xdma_disable_src_ext(uint8_t ext) { + if (ext >= XDMA_SRC_EXT_NUM) { + return -1; + } + uint8_t custom_csr_list[XDMA_SRC_EXT_NUM] = XDMA_SRC_EXT_CUSTOM_CSR_NUM; + uint32_t csr_offset = XDMA_SRC_EXT_CSR_PTR; + for (uint8_t i = 0; i < ext; i++) { + csr_offset += custom_csr_list[i] + 1; + } + + // Bypass the xdma extension -> set the first CSR to 1 + write_csr_soft_switch(csr_offset, 1); + return 0; +} + +int32_t xdma_disable_dst_ext(uint8_t ext) { + if (ext >= XDMA_DST_EXT_NUM) { + return -1; + } + uint8_t custom_csr_list[XDMA_DST_EXT_NUM] = XDMA_DST_EXT_CUSTOM_CSR_NUM; + uint32_t csr_offset = XDMA_DST_EXT_CSR_PTR; + for (uint8_t i = 0; i < ext; i++) { + csr_offset += custom_csr_list[i] + 1; + } + + // Bypass the xdma extension -> set the first CSR to 1 + write_csr_soft_switch(csr_offset, 1); + return 0; +} + +// Start xdma +uint32_t xdma_start() { + int ret = read_csr_soft_switch(XDMA_COMMIT_TASK_PTR); + write_csr_soft_switch(XDMA_START_PTR, 1); + while (read_csr_soft_switch(XDMA_COMMIT_TASK_PTR) == ret) { + // Wait for xdma to start + } + return read_csr_soft_switch(XDMA_COMMIT_TASK_PTR); +} + +// Check if xdma is finished +bool xdma_is_finished(uint32_t task_id) { + return read_csr_soft_switch(XDMA_FINISH_TASK_PTR) >= task_id; +} + +void xdma_wait(uint32_t task_id) { + while (!xdma_is_finished(task_id)) { + // Wait for xdma to finish + } +} diff --git a/util/wrappergen/wrappergen.py b/util/wrappergen/wrappergen.py index 23f331f6e..c376f5f35 100755 --- a/util/wrappergen/wrappergen.py +++ b/util/wrappergen/wrappergen.py @@ -56,7 +56,8 @@ def gen_file(cfg, tpl, target_path: str, file_name: str) -> None: def gen_chisel_file(chisel_path, chisel_param, gen_path): cmd = f" cd {chisel_path} && \ mill Snax.runMain {chisel_param} {gen_path}" - os.system(cmd) + if os.system(cmd) != 0: + raise ChildProcessError('Chisel generation error. 
') return @@ -213,7 +214,8 @@ def main(): acc_cfgs[i]["tcdm_depth"] = tcdm_depth tcdm_num_banks = cfg["cluster"]["tcdm"]["banks"] acc_cfgs[i]["tcdm_num_banks"] = tcdm_num_banks - tcdm_addr_width = tcdm_num_banks * tcdm_depth * (tcdm_data_width // 8) + tcdm_addr_width = tcdm_num_banks * \ + tcdm_depth * (tcdm_data_width // 8) tcdm_addr_width = int(math.log2(tcdm_addr_width)) acc_cfgs[i]["tcdm_addr_width"] = tcdm_addr_width # Chisel parameter tag names @@ -249,7 +251,8 @@ def main(): file_name=file_name, ) - rtl_target_path = args.gen_path + acc_cfgs[i]["snax_acc_name"] + "/" + rtl_target_path = args.gen_path + \ + acc_cfgs[i]["snax_acc_name"] + "/" # This is for RTL wrapper and chisel generation # This first one generates the CSR manager wrapper @@ -305,6 +308,43 @@ def main(): else: print("Skipping accelerator generation!") + # Generate xdma for the whole cluster + snax_xdma_cfg = None + for i in range(num_cores): + if "snax_xdma_cfg" in cfg_cores[i]: + snax_xdma_cfg = cfg_cores[i]["snax_xdma_cfg"] + if (snax_xdma_cfg is not None): + tpl_rtl_wrapper_file = args.tpl_path + "snax_xdma_wrapper.sv.tpl" + + tpl_rtl_wrapper = get_template(tpl_rtl_wrapper_file) + + gen_file( + cfg=cfg["cluster"], + tpl=tpl_rtl_wrapper, + target_path=args.gen_path + cfg["cluster"]["name"] + "_xdma/", + file_name=cfg["cluster"]["name"] + "_xdma_wrapper.sv", + ) + + print(args.gen_path) + gen_chisel_file( + chisel_path=args.chisel_path, + chisel_param="snax.xdma.xdmaTop.xdmaTopGen", + gen_path=" --clusterName " + str(cfg["cluster"]["name"]) + + " --tcdmDataWidth " + str(cfg["cluster"]["data_width"]) + + " --axiDataWidth " + str(cfg["cluster"]["dma_data_width"]) + + " --addressWidth " + str(cfg["cluster"]["addr_width"]) + + " --tcdmSize " + str(cfg["cluster"]["tcdm"]["size"]) + + " --readerDimension " + str(snax_xdma_cfg["reader_agu_dimension"]) + + " --writerDimension " + str(snax_xdma_cfg["writer_agu_dimension"]) + + " --readerBufferDepth " + str(snax_xdma_cfg["reader_buffer"]) + + " --writerBufferDepth " + str(snax_xdma_cfg["writer_buffer"]) + + (" --HasMemset " if snax_xdma_cfg["has_memset"] else "") + + (" --HasMaxPool " if snax_xdma_cfg["has_maxpool"] else "") + + (" --HasTransposer " if snax_xdma_cfg["has_transposer"] else "") + + " --target-dir " + args.gen_path + + cfg["cluster"]["name"] + "_xdma/" + ) + # Generation of testharness test_target_path = args.test_path file_name = "testharness.sv"
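
Reviewer note: the snax-xdma-memset app in this patch exercises the new xdma C API end to end. Below is a minimal, self-contained sketch of the intended call order, assuming the declarations in snax-xdma-lib.h and the convention used in this patch that destination extension 0 is the memset extension with a single custom CSR (the fill pattern). The function name example_memset_16kb is illustrative only and is not part of the patch.

// Sketch only: illustrates the intended call order of the new xdma library.
// Addresses and sizes follow the snax-xdma-memset example in this patch.
#include "snax-xdma-lib.h"
#include "snrt.h"

void example_memset_16kb(void) {
    uint8_t *tcdm = (uint8_t *)snrt_l1_next();

    // 1. Program the source/destination AGUs (1D transfer over the first 16 KiB).
    if (xdma_memcpy_1d(tcdm, tcdm, 0x4000) != 0) return;

    // 2. Enable destination extension 0 (the memset extension in this
    //    configuration) with the fill pattern as its single custom CSR.
    uint32_t fill = 0xFFFFFFFF;
    if (xdma_enable_dst_ext(0, &fill) != 0) return;

    // 3. Bypass the remaining destination extensions.
    xdma_disable_dst_ext(1);
    xdma_disable_dst_ext(2);

    // 4. Commit the task and block until it has retired.
    uint32_t task_id = xdma_start();
    xdma_wait(task_id);
}

The same configure-AGU, enable-or-bypass-extensions, xdma_start, xdma_wait sequence is repeated by the app for the second transfer that clears the 4 KiB to 12 KiB region.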
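
The chained, unparenthesized #defines in snax-xdma-csr-addr.h resolve to a contiguous CSR map. For the configuration generated in this patch (base address 960, three source and three destination dimensions, no source extensions, three destination extensions with 1, 1 and 0 custom CSRs), the arithmetic works out as below. The _Static_asserts are illustrative compile-time checks of that arithmetic, not part of the patch.

// Expected resolution of the generated CSR map (base address 960).
#include "snax-xdma-csr-addr.h"

_Static_assert(XDMA_SRC_BOUND_PTR    == 962, "src bounds start at 962");
_Static_assert(XDMA_SRC_STRIDE_PTR   == 965, "src strides start at 965");
_Static_assert(XDMA_SRC_EXT_CSR_PTR  == 968, "no src extension CSRs");
_Static_assert(XDMA_DST_ADDR_PTR_LSB == 968, "dst pointer follows directly");
_Static_assert(XDMA_DST_BOUND_PTR    == 970, "dst bounds start at 970");
_Static_assert(XDMA_DST_STRIDE_PTR   == 973, "dst strides start at 973");
_Static_assert(XDMA_DST_EXT_CSR_PTR  == 976, "dst extension CSRs: 976-980");
_Static_assert(XDMA_START_PTR        == 981, "start/commit/finish: 981-983");

// Destination extension CSRs ({1, 1, 0} custom CSRs per extension), as
// walked by xdma_enable_dst_ext/xdma_disable_dst_ext:
//   ext 0: bypass @ 976, custom CSR @ 977
//   ext 1: bypass @ 978, custom CSR @ 979
//   ext 2: bypass @ 980
//   total = 5 = XDMA_DST_EXT_CSR_NUM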
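
The 32-entry switch statements in read_csr_soft_switch and write_csr_soft_switch exist because the RISC-V csrr/csrrw instructions encode the CSR number as an immediate, so the snrt read_csr/write_csr macros need a compile-time constant for each access. If the code size ever becomes a concern, the same dispatch can be generated with an X-macro. The sketch below is an illustrative, behaviour-equivalent alternative, not a requested change; it assumes read_csr accepts a numeric CSR literal exactly as the hand-written cases in snax-xdma-lib.c already do.

// Illustrative alternative only: generate the 32 CSR cases with an X-macro.
#include <stdint.h>
#include "snrt.h"

#define XDMA_CSR_LIST(X)                                            \
    X(960) X(961) X(962) X(963) X(964) X(965) X(966) X(967)         \
    X(968) X(969) X(970) X(971) X(972) X(973) X(974) X(975)         \
    X(976) X(977) X(978) X(979) X(980) X(981) X(982) X(983)         \
    X(984) X(985) X(986) X(987) X(988) X(989) X(990) X(991)

static uint32_t read_csr_soft_switch_alt(uint32_t csr_address) {
    switch (csr_address) {
#define XDMA_READ_CASE(n) \
    case n:               \
        return read_csr(n);
        XDMA_CSR_LIST(XDMA_READ_CASE)
#undef XDMA_READ_CASE
        default:
            return 0;  // out-of-range addresses read as 0, as in the library
    }
}

The write path can be generated the same way with a case macro that calls write_csr(n, value) and breaks.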
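
xdma_memcpy_nd only accepts unit sizes of 8 bytes or a multiple of 64 bytes, and a 64-byte-aligned unit silently consumes the innermost AGU dimension (stride 8, bound unit_size / 8), leaving at most XDMA_SRC_DIM - 1 and XDMA_DST_DIM - 1 caller-visible dimensions; mismatched total transfer sizes return -3. Below is a hedged sketch of a 2D strided copy under those rules. The buffer layout is invented for illustration, and the assumption that user strides are expressed in bytes is inferred from how the library programs the innermost dimension (stride 8 per 8-byte beat) rather than stated in the patch.

// Sketch: copy 16 rows of 64 bytes from a 128-byte-pitch source buffer
// into a dense destination buffer. Layout is illustrative only.
#include "snax-xdma-lib.h"

int32_t example_strided_copy(uint8_t *src, uint8_t *dst) {
    uint32_t bound_src[1]  = {16};   // 16 units of 64 bytes
    uint32_t stride_src[1] = {128};  // source rows are 128 bytes apart (assumed byte units)
    uint32_t bound_dst[1]  = {16};
    uint32_t stride_dst[1] = {64};   // destination is packed

    // A 64-byte unit occupies the innermost AGU dimension, so this call uses
    // 2 of the 3 available dimensions on each side. Total sizes match:
    // 16 * 64 bytes on both sides, otherwise the library returns -3.
    return xdma_memcpy_nd(src, dst, 64, 64, 2, 2,
                          stride_src, stride_dst, bound_src, bound_dst);
}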