From 74594d2885d2df3e5035805e0ce6f731e0d723f5 Mon Sep 17 00:00:00 2001
From: Luca Colagrande
Date: Wed, 3 Apr 2024 13:52:18 +0200
Subject: [PATCH] trace: Add functionality to parse DMA trace

---
 target/common/common.mk |   4 +-
 util/trace/gen_trace.py | 151 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 142 insertions(+), 13 deletions(-)

diff --git a/target/common/common.mk b/target/common/common.mk
index 981ab8ab3d..1bc2a6865f 100644
--- a/target/common/common.mk
+++ b/target/common/common.mk
@@ -233,8 +233,8 @@ clean-perf:
 clean-visual-trace:
 	rm -f $(VISUAL_TRACE)
 
-$(addprefix $(LOGS_DIR)/,trace_hart_%.txt hart_%_perf.json): $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY)
-	$(DASM) < $< | $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt
+$(addprefix $(LOGS_DIR)/,trace_hart_%.txt hart_%_perf.json dma_%_perf.json): $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY)
+	$(DASM) < $< | $(GENTRACE_PY) --permissive --dma-trace $(SIM_DIR)/dma_trace_$*.log --dump-hart-perf $(LOGS_DIR)/hart_$*_perf.json --dump-dma-perf $(LOGS_DIR)/dma_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt
 
 # Generate source-code interleaved traces for all harts. Reads the binary from
 # the logs/.rtlbinary file that is written at start of simulation in the vsim script
diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py
index 9a838cd69c..61b21d074f 100755
--- a/util/trace/gen_trace.py
+++ b/util/trace/gen_trace.py
@@ -23,6 +23,12 @@
 
 Performance metrics are appended at the end of the generated trace
 and can optionally be dumped to a separate JSON file.
+
+It also computes per-transfer and aggregate performance metrics for
+DMA transfers, provided that the Snitch core is equipped with a
+tightly-coupled DMA engine and the DMA trace logged during simulation
+(see `axi_dma_backend.sv`) is fed to the tool. DMA performance
+metrics are dumped to a separate JSON file.
 """
 # TODO: OPER_TYPES and FPU_OPER_TYPES could break: optimization might alter enum mapping
@@ -32,9 +38,10 @@
 import re
 import argparse
 import json
+import ast
 from ctypes import c_int32, c_uint32
 from collections import deque, defaultdict
-import pathlib
+from pathlib import Path
 
 EXTRA_WB_WARN = 'WARNING: {} transactions still in flight for {}.'
@@ -332,7 +339,7 @@ def load_opcodes():
     global _cached_opcodes
     opcode_file_name = 'opcodes-flt-occamy_CUSTOM.csv'
-    opcode_file_path = pathlib.Path(__file__).parent.absolute() / opcode_file_name
+    opcode_file_path = Path(__file__).parent.absolute() / opcode_file_name
     _cached_opcodes = {}
     with open(opcode_file_path, 'r') as f:
@@ -510,6 +517,109 @@ def flt_lit(num: int, fmt: int, width: int = 6, vlen: int = 1) -> str:
         return floats[0]
 
 
+# -------------------- DMA --------------------
+
+
+# We always assume dma_trans contains at least one incomplete placeholder DMA transaction.
+# This incomplete transaction contains default settings. Only upon a DMCPY* instruction
+# is the size of the transaction known, completing the transaction. At that point, a new
+# incomplete transaction is created, inheriting the configuration settings from the previous
+# transaction, which may or may not be overridden before the next DMCPY*.
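+#
+# For illustration (the operand values below are hypothetical): a `dmrep` with
+# opa=4 followed by a 2D `dmcpy` with opa=256 evolves dma_trans as follows:
+#     [{'rep': 1}]                            initial placeholder
+#     [{'rep': 4}]                            after dmrep
+#     [{'rep': 4, 'size': 256}, {'rep': 4}]   after dmcpy: the completed
+#                                             transaction, followed by a new
+#                                             placeholder inheriting its settings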
+def update_dma(insn, extras, dma_trans):
+    # Extract the instruction mnemonic from the full instruction disassembly
+    # (which includes the operand registers)
+    MNEMONIC_REGEX = r'^([\w.]+)\s'
+    match = re.match(MNEMONIC_REGEX, insn)
+    if match:
+        mnemonic = match.group(1)
+        # Process DMA instruction
+        if mnemonic in ['dmsrc', 'dmdst', 'dmstr']:
+            # Source, destination and stride settings do not affect the
+            # metrics we compute, so they are ignored.
+            pass
+        elif mnemonic == 'dmrep':
+            dma_trans[-1]['rep'] = extras['opa']
+        elif mnemonic in ['dmcpy', 'dmcpyi']:
+            # Create a new placeholder transaction inheriting the current DMA settings
+            dma_trans.append(dma_trans[-1].copy())
+            # Set the size of the transaction
+            dma_trans[-2]['size'] = extras['opa']
+            # Override the repetition count if the transaction is configured to be 1D
+            config = extras['rs2']
+            enable_2d = (config & 2) >> 1
+            if not enable_2d:
+                dma_trans[-2]['rep'] = 1
+
+
+def eval_dma_metrics(dma_trans, dma_trace):
+    dma_trace = Path(dma_trace)
+    if dma_trace.exists():
+        with open(dma_trace, 'r') as f:
+            # Initialize variables
+            compl_transfers = []
+            outst_transfers = []
+            req_transfer_idx = 0
+            req_bytes = 0
+            # Iterate over the lines in the DMA trace
+            for line in f.readlines():
+                dma = ast.literal_eval(line)
+                if 'backend_burst_req_valid' in dma:
+                    # When the first burst in a transfer is granted, we record a new transfer
+                    # in the outstanding transfers queue, with the information obtained from
+                    # the core trace. We record the number of bytes moved by each burst in a
+                    # transfer, and compare the total to the number of bytes moved by the
+                    # transfer, to count how many bursts belong to the current DMA transfer
+                    # (a number which is difficult to pre-compute from the core trace, as it
+                    # depends on address alignment, etc.).
+                    if dma['backend_burst_req_valid'] and dma['backend_burst_req_ready']:
+                        if req_bytes == 0:
+                            n_bytes = dma_trans[req_transfer_idx]['rep'] * \
+                                dma_trans[req_transfer_idx]['size']
+                            outst_transfers.append({'tstart': dma['time'],
+                                                    'exp_bursts': 0,
+                                                    'rec_bursts': 0,
+                                                    'bytes': n_bytes})
+                        req_bytes += dma['backend_burst_req_num_bytes']
+                        outst_transfers[-1]['exp_bursts'] += 1
+                        # We move on to the next transfer when the bytes requested by the
+                        # previous bursts match the current transfer size.
+                        if req_bytes == outst_transfers[-1]['bytes']:
+                            req_bytes = 0
+                            req_transfer_idx += 1
+                    # Upon a burst completion, we increment the received bursts count. When
+                    # this count matches the expected bursts count of the current transfer,
+                    # we record the end time of the transfer and promote it from the
+                    # outstanding to the completed transfers queue.
+                    if dma['transfer_completed']:
+                        outst_transfers[0]['rec_bursts'] += 1
+                        if outst_transfers[0]['rec_bursts'] == outst_transfers[0]['exp_bursts']:
+                            outst_transfers[0]['tend'] = dma['time']
+                            compl_transfer = outst_transfers.pop(0)
+                            compl_transfer.pop('exp_bursts')
+                            compl_transfer.pop('rec_bursts')
+                            compl_transfers.append(compl_transfer)
+        # Calculate the bandwidth of individual transfers
+        for transfer in compl_transfers:
+            transfer['cycles'] = transfer['tend'] - transfer['tstart']
+            transfer['bw'] = transfer['bytes'] / transfer['cycles']
+        # Calculate the aggregate bandwidth: the total number of bytes transferred while
+        # any transfer is active (accounts for overlaps between transfers).
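+        # For example (hypothetical timestamps): transfers spanning cycles
+        # [0, 100] and [50, 150] overlap, so active_cycles is 150 rather than
+        # 200; the second transfer contributes only its non-overlapping tail
+        # (150 - 100 = 50 cycles).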
+        prev_trans_end = 0
+        active_cycles = 0
+        n_bytes = 0
+        for transfer in compl_transfers:
+            # Accumulate active cycles, without double-counting overlaps
+            curr_trans_start, curr_trans_end = transfer['tstart'], transfer['tend']
+            if curr_trans_start > prev_trans_end:
+                active_cycles += curr_trans_end - curr_trans_start
+            else:
+                active_cycles += curr_trans_end - prev_trans_end
+            prev_trans_end = curr_trans_end
+            # Accumulate the total number of bytes
+            n_bytes += transfer['bytes']
+        dma_metrics = {}
+        if active_cycles != 0:
+            dma_metrics['aggregate_bw'] = n_bytes / active_cycles
+        dma_metrics['transfers'] = compl_transfers
+        return dma_metrics
+
+
 # -------------------- FPU Sequencer --------------------
@@ -772,7 +882,8 @@ def annotate_insn(
     annot_fseq_offl: bool = False,  # Annotate whenever core offloads to CPU on own line
     force_hex_addr: bool = True,
-    permissive: bool = True
+    permissive: bool = True,
+    dma_trans: list = []  # List of DMA transactions, shared and mutated across calls
 ) -> (str, tuple, bool
       ):  # Return time info, whether trace line contains no info, and fseq_len
     match = re.search(TRACE_IN_REGEX, line.strip('\n'))
@@ -801,6 +912,7 @@
                 insn, pc_str = ('', '')
             else:
                 perf_metrics[-1]['snitch_issues'] += 1
+            update_dma(insn, extras, dma_trans)
         # Annotate sequencer
         elif extras['source'] == TRACE_SRCES['sequencer']:
             if extras['cbuf_push']:
@@ -961,12 +1073,20 @@
         '--permissive',
         action='store_true',
         help='Ignore some state-related issues when they occur')
-    parser.add_argument('-d',
-                        '--dump-perf',
-                        nargs='?',
-                        metavar='file',
-                        type=argparse.FileType('w'),
-                        help='Dump performance metrics as json text.')
+    parser.add_argument(
+        '--dma-trace',
+        help='Path to a DMA trace file'
+    )
+    parser.add_argument(
+        '--dump-hart-perf',
+        nargs='?',
+        type=argparse.FileType('w'),
+        help='Dump hart performance metrics as json text.'
+    )
+    parser.add_argument(
+        '--dump-dma-perf',
+        help='Dump DMA performance metrics as json text.'
+    )
     args = parser.parse_args()
 
     line_iter = iter(args.infile.readline, b'')
@@ -983,6 +1103,7 @@
         'cfg_buf': deque(),
         'curr_cfg': None
     }
+    dma_trans = [{'rep': 1}]
     perf_metrics = [
         defaultdict(int)
     ]  # all values initially 0, also 'start' time of measurement 0
@@ -1008,10 +1129,18 @@
             print('\n## Performance metrics', file=file)
             for idx in range(len(perf_metrics)):
                 print('\n' + fmt_perf_metrics(perf_metrics, idx, not args.allkeys), file=file)
+    # Evaluate DMA metrics if a DMA trace was provided
+    dma_metrics = None
+    if args.dma_trace:
+        dma_metrics = eval_dma_metrics(dma_trans, args.dma_trace)
 
-    if args.dump_perf:
-        with args.dump_perf as file:
+    # Dump hart performance metrics to a JSON file
+    if args.dump_hart_perf:
+        with args.dump_hart_perf as file:
             file.write(json.dumps(perf_metrics, indent=4))
+    # Dump DMA performance metrics to a JSON file
+    if args.dump_dma_perf and dma_metrics is not None:
+        with open(args.dump_dma_perf, 'w') as file:
+            file.write(json.dumps(dma_metrics, indent=4))
 
     # Check for any loose ends and warn before exiting
     seq_isns = len(fseq_info['fseq_pcs']) + len(fseq_info['cfg_buf'])
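
Usage sketch (hart index and paths are hypothetical, following the common.mk
rule above):

    $(DASM) < logs/trace_hart_00000.dasm | util/trace/gen_trace.py \
        --permissive \
        --dma-trace sim/dma_trace_00000.log \
        --dump-hart-perf logs/hart_00000_perf.json \
        --dump-dma-perf logs/dma_00000_perf.json \
        -o logs/trace_hart_00000.txt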