diff --git a/target/common/common.mk b/target/common/common.mk index 981ab8ab3d..1bc2a6865f 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -233,8 +233,8 @@ clean-perf: clean-visual-trace: rm -f $(VISUAL_TRACE) -$(addprefix $(LOGS_DIR)/,trace_hart_%.txt hart_%_perf.json): $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) - $(DASM) < $< | $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt +$(addprefix $(LOGS_DIR)/,trace_hart_%.txt hart_%_perf.json dma_%_perf.json): $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) + $(DASM) < $< | $(GENTRACE_PY) --permissive --dma-trace $(SIM_DIR)/dma_trace_$*.log --dump-hart-perf $(LOGS_DIR)/hart_$*_perf.json --dump-dma-perf $(LOGS_DIR)/dma_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt # Generate source-code interleaved traces for all harts. Reads the binary from # the logs/.rtlbinary file that is written at start of simulation in the vsim script diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py index 9a838cd69c..61b21d074f 100755 --- a/util/trace/gen_trace.py +++ b/util/trace/gen_trace.py @@ -23,6 +23,12 @@ Performance metrics are appended at the end of the generated trace and can optionally be dumped to a separate JSON file. + +It also computes various performance metrics for every DMA transfer, +provided that the Snitch core is equipped with a tightly-coupled DMA +engine, and the DMA trace logged during simulation +(see `axi_dma_backend.sv`) is fed to the tool. DMA performance +metrics are dumped to a separate JSON file. """ # TODO: OPER_TYPES and FPU_OPER_TYPES could break: optimization might alter enum mapping @@ -32,9 +38,10 @@ import re import argparse import json +import ast from ctypes import c_int32, c_uint32 from collections import deque, defaultdict -import pathlib +from pathlib import Path EXTRA_WB_WARN = 'WARNING: {} transactions still in flight for {}.' 
# -------------------- DMA --------------------


# Matches the instruction mnemonic at the start of a full instruction decoding
# (the decoding also includes the operand registers, separated by whitespace).
_MNEMONIC_REGEX = re.compile(r'^([\w.]+)\s')


# We always assume dma_trans contains at least one incomplete placeholder DMA transaction.
# This incomplete transaction contains default settings. Only upon a DMCPY* instruction
# is the size of the transaction known, completing the transaction. At that point, a new
# incomplete transaction is created, inheriting the configuration settings from the previous
# transaction, which may or may not be overridden before the next DMCPY*.
def update_dma(insn, extras, dma_trans):
    """Update the list of DMA transactions with the effect of one instruction.

    Args:
        insn: Full instruction decoding (mnemonic followed by its operands).
            Instructions without a whitespace-terminated mnemonic are ignored.
        extras: Trace annotations of the instruction. `extras['opa']` is read
            for `dmrep` (repetition count) and `dmcpy`/`dmcpyi` (transfer
            size); `extras['rs2']` is read for `dmcpy`/`dmcpyi` (configuration
            word, bit 1 enables 2D transfers).
        dma_trans: List of transaction dicts, updated in place. Its last
            element is the incomplete placeholder described above; an empty
            list is seeded with the default placeholder `{'rep': 1}`.
    """
    match = _MNEMONIC_REGEX.match(insn)
    if match is None:
        return
    mnemonic = match.group(1)
    # dmsrc, dmdst and dmstr do not affect the number of bytes moved, so they
    # are deliberately not tracked (like every non-DMA instruction).
    if mnemonic not in ('dmrep', 'dmcpy', 'dmcpyi'):
        return
    # Restore the placeholder invariant if the caller handed us an empty list,
    # instead of failing with an IndexError on dma_trans[-1].
    if not dma_trans:
        dma_trans.append({'rep': 1})
    if mnemonic == 'dmrep':
        dma_trans[-1]['rep'] = extras['opa']
        return
    # dmcpy / dmcpyi: the placeholder becomes a complete transaction.
    current = dma_trans[-1]
    # Create new placeholder transaction to inherit current DMA settings. The
    # copy is taken before size and 1D override are applied, so it inherits
    # neither of them.
    dma_trans.append(current.copy())
    # Set size of the transaction
    current['size'] = extras['opa']
    # Override repetition count if the transaction is configured to be 1D
    enable_2d = extras['rs2'] & 2
    if not enable_2d:
        current['rep'] = 1


def _collect_dma_transfers(dma_trans, trace_lines):
    """Match the bursts logged in a DMA trace to the transfers in `dma_trans`.

    Returns the completed transfers, in completion order, as dicts with the
    keys `tstart`, `bytes` and `tend`.
    """
    completed = []
    outstanding = deque()
    req_transfer_idx = 0
    req_bytes = 0
    for line in trace_lines:
        line = line.strip()
        # Tolerate blank lines (e.g. at the end of the file), which
        # ast.literal_eval would reject with a SyntaxError.
        if not line:
            continue
        dma = ast.literal_eval(line)
        # Records which do not carry the burst request signals are skipped.
        if 'backend_burst_req_valid' not in dma:
            continue
        # When the first burst in a transfer is granted, we record a new transfer in
        # the outstanding transfers queue, with the information obtained from the core
        # trace. We record the number of bytes moved by each burst in a transfer, and
        # compare the total to the number of bytes moved by the transfer, to count how
        # many bursts belong to the current DMA transfer (a number which is difficult
        # to pre-compute from the core trace as it depends on address alignments, etc.)
        if dma['backend_burst_req_valid'] and dma['backend_burst_req_ready']:
            if req_bytes == 0:
                config = dma_trans[req_transfer_idx]
                outstanding.append({'tstart': dma['time'],
                                    'bytes': config['rep'] * config['size'],
                                    'exp_bursts': 0,
                                    'rec_bursts': 0,
                                    'all_requested': False})
            req_bytes += dma['backend_burst_req_num_bytes']
            requesting = outstanding[-1]
            requesting['exp_bursts'] += 1
            # We move on to the next transfer when the bytes requested by the previous
            # bursts match the current transfer size.
            if req_bytes == requesting['bytes']:
                requesting['all_requested'] = True
                req_bytes = 0
                req_transfer_idx += 1
        # Upon a burst completion, we increment the received bursts count of the
        # oldest outstanding transfer. The transfer is complete once all of its
        # bursts have been requested and received. Waiting for all requests is
        # required: a burst may complete before the remaining bursts of the same
        # transfer are requested, at which point the two counts match only
        # temporarily.
        if dma['transfer_completed']:
            oldest = outstanding[0]
            oldest['rec_bursts'] += 1
            if oldest['all_requested'] and oldest['rec_bursts'] == oldest['exp_bursts']:
                outstanding.popleft()
                completed.append({'tstart': oldest['tstart'],
                                  'bytes': oldest['bytes'],
                                  'tend': dma['time']})
    return completed


def eval_dma_metrics(dma_trans, dma_trace):
    """Compute performance metrics of the DMA transfers found in a DMA trace.

    Args:
        dma_trans: List of transaction dicts as built by `update_dma`; the
            `rep` and `size` entries give the bytes moved by each transfer.
        dma_trace: Path to the DMA trace file. Every non-blank line must be a
            Python dict literal.

    Returns:
        None if the trace file does not exist. Otherwise a dict with the key
        `transfers` (one dict per completed transfer with `tstart`, `bytes`,
        `tend`, `cycles` and `bw`) and, if any transfer was active for a
        non-zero time, `aggregate_bw`. Durations are differences of the `time`
        fields in the trace; bandwidths are bytes per unit of that time.
    """
    try:
        trace_file = open(dma_trace, 'r')
    except FileNotFoundError:
        # No DMA trace was logged: there are no metrics to report.
        return None
    with trace_file:
        compl_transfers = _collect_dma_transfers(dma_trans, trace_file)
    # Calculate bandwidth of individual transfers
    for transfer in compl_transfers:
        cycles = transfer['tend'] - transfer['tstart']
        transfer['cycles'] = cycles
        # A transfer starting and ending at the same timestamp has no
        # meaningful bandwidth; report None rather than dividing by zero.
        transfer['bw'] = transfer['bytes'] / cycles if cycles else None
    # Calculate aggregate bandwidth: total number of bytes transferred while any transfer is
    # active (accounts for overlaps between transfers).
    prev_trans_end = 0
    active_cycles = 0
    n_bytes = 0
    for transfer in compl_transfers:
        # Calculate active cycles, without double-counting overlaps: the part of
        # this transfer preceding the end of the previous one was already counted.
        active_cycles += transfer['tend'] - max(transfer['tstart'], prev_trans_end)
        prev_trans_end = transfer['tend']
        # Calculate total number of bytes
        n_bytes += transfer['bytes']
    dma_metrics = {}
    if active_cycles != 0:
        dma_metrics['aggregate_bw'] = n_bytes / active_cycles
    dma_metrics['transfers'] = compl_transfers
    return dma_metrics
extras['cbuf_push']: @@ -961,12 +1073,20 @@ def main(): '--permissive', action='store_true', help='Ignore some state-related issues when they occur') - parser.add_argument('-d', - '--dump-perf', - nargs='?', - metavar='file', - type=argparse.FileType('w'), - help='Dump performance metrics as json text.') + parser.add_argument( + '--dma-trace', + help='Path to a DMA trace file' + ) + parser.add_argument( + '--dump-hart-perf', + nargs='?', + type=argparse.FileType('w'), + help='Dump hart performance metrics as json text.' + ) + parser.add_argument( + '--dump-dma-perf', + help='Dump DMA performance metrics as json text.' + ) args = parser.parse_args() line_iter = iter(args.infile.readline, b'') @@ -983,6 +1103,7 @@ def main(): 'cfg_buf': deque(), 'curr_cfg': None } + dma_trans = [{'rep': 1}] perf_metrics = [ defaultdict(int) ] # all values initially 0, also 'start' time of measurement 0 @@ -1008,10 +1129,18 @@ def main(): print('\n## Performance metrics', file=file) for idx in range(len(perf_metrics)): print('\n' + fmt_perf_metrics(perf_metrics, idx, not args.allkeys), file=file) + # Emit DMA metrics + if args.dma_trace: + dma_metrics = eval_dma_metrics(dma_trans, args.dma_trace) - if args.dump_perf: - with args.dump_perf as file: + # Dump hart performance metrics to JSON file + if args.dump_hart_perf: + with args.dump_hart_perf as file: file.write(json.dumps(perf_metrics, indent=4)) + # Dump DMA performance metrics to JSON file + if args.dump_dma_perf and dma_metrics is not None: + with open(args.dump_dma_perf, 'w') as file: + file.write(json.dumps(dma_metrics, indent=4)) # Check for any loose ends and warn before exiting seq_isns = len(fseq_info['fseq_pcs']) + len(fseq_info['cfg_buf'])