Expose nvbandwidth variables and add test case for _preprocess.
hongtaozhang committed Nov 21, 2024
1 parent 1fdeb9c commit 550219b
Showing 3 changed files with 250 additions and 54 deletions.
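
For quick reference, the sketch below summarizes how the newly exposed benchmark arguments map onto nvbandwidth CLI flags, as implemented in the _preprocess changes further down. It is an illustrative summary, not part of the diff; the defaults noted in the comments are those added in this commit.

# Argument-to-flag mapping introduced in nvbandwidth.py (summary sketch, not part of the diff).
ARG_TO_FLAG = {
    'buffer_size': '--bufferSize',              # memcpy buffer size in MiB, default 64
    'test_cases': '--testcase',                 # comma-separated names/indices, space-joined on the command line
    'skip_verification': '--skipVerification',  # store_true, default False
    'disable_affinity': '--disableAffinity',    # store_true, default False
    'use_mean': '--useMean',                    # store_true, default False
    'num_loops': '--testSamples',               # number of benchmark iterations, default 3
}
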
82 changes: 72 additions & 10 deletions superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -28,6 +28,48 @@ def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()

self._parser.add_argument(
'--buffer_size',
type=int,
default=64,
required=False,
help='Memcpy buffer size in MiB. Default is 64.',
)

self._parser.add_argument(
'--test_cases',
type=str,
default='',
required=False,
help='Specify the test case(s) to run, either by name or index. By default, all test cases are executed.',
)

self._parser.add_argument(
'--skip_verification',
action='store_true',
help='Skip data verification after copy. Default is False.',
)

self._parser.add_argument(
'--disable_affinity',
action='store_true',
help='Disable automatic CPU affinity control. Default is False.',
)

self._parser.add_argument(
'--use_mean',
action='store_true',
help='Use mean instead of median for results. Default is False.',
)

self._parser.add_argument(
'--num_loops',
type=int,
default=3,
required=False,
help='Number of benchmark iterations (maps to --testSamples). Default is 3.',
)

def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
@@ -43,6 +85,24 @@ def _preprocess(self):
# Construct the command for nvbandwidth
command = os.path.join(self._args.bin_dir, self._bin_name)

if self._args.buffer_size:
command += f' --bufferSize {self._args.buffer_size}'

if self._args.test_cases:
command += ' --testcase ' + ' '.join([testcase.strip() for testcase in self._args.test_cases.split(',')])

if self._args.skip_verification:
command += ' --skipVerification'

if self._args.disable_affinity:
command += ' --disableAffinity'

if self._args.use_mean:
command += ' --useMean'

if self._args.num_loops:
command += f' --testSamples {self._args.num_loops}'

self._commands.append(command)

return True
@@ -57,17 +117,17 @@ def _process_raw_line(self, line, parse_status):
line (str): A single line of raw output from the benchmark.
parse_status (dict): A dictionary to maintain the current parsing state and results. It should contain:
- 'test_name' (str): The name of the current test being parsed.
- 'parsing_matrix' (bool): A flag indicating if matrix data is being parsed.
- 'benchmark_type' (str): 'bw' or 'lat'. It also indicates whether matrix data is being parsed.
- 'matrix_header' (list): The header of the matrix being parsed.
- 'results' (dict): A dictionary to store the parsed results.
Returns:
Return:
None
"""
# Regular expressions for summary line and matrix header detection
block_start_pattern = re.compile(r'^Running\s+(.+)$')
summary_pattern = re.compile(r'SUM (\S+) (\d+\.\d+)')
matrix_header_line = re.compile(r'^memcpy CE CPU\(row\)')
matrix_header_line = re.compile(r'^(memcpy|memory latency)')
matrix_row_pattern = re.compile(r'^\s*\d')

line = line.strip()
@@ -79,25 +139,26 @@

# Detect the start of matrix data
if parse_status['test_name'] and matrix_header_line.match(line):
parse_status['parsing_matrix'] = True
parse_status['benchmark_type'] = 'bw' if 'bandwidth' in line else 'lat'
return

# Parse the matrix header
if (
parse_status['test_name'] and parse_status['parsing_matrix'] and not parse_status['matrix_header']
parse_status['test_name'] and parse_status['benchmark_type'] and not parse_status['matrix_header']
and matrix_row_pattern.match(line)
):
parse_status['matrix_header'] = line.split()
return

# Parse matrix rows
if parse_status['test_name'] and parse_status['parsing_matrix'] and matrix_row_pattern.match(line):
if parse_status['test_name'] and parse_status['benchmark_type'] and matrix_row_pattern.match(line):
row_data = line.split()
row_index = row_data[0]
for col_index, value in enumerate(row_data[1:], start=1):
col_header = parse_status['matrix_header'][col_index - 1]
test_name = parse_status['test_name']
metric_name = f'{test_name}_bandwidth_cpu{row_index}_gpu{col_header}'
benchmark_type = parse_status['benchmark_type']
metric_name = f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'
parse_status['results'][metric_name] = float(value)
return

@@ -106,11 +167,12 @@ def _process_raw_result(self, cmd_idx, raw_output):
if summary_match:
value = float(summary_match.group(2))
test_name = parse_status['test_name']
parse_status['results'][f'{test_name}_sum_bandwidth'] = value
benchmark_type = parse_status['benchmark_type']
parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = value

# Reset parsing state for next test
parse_status['test_name'] = ''
parse_status['parsing_matrix'] = False
parse_status['benchmark_type'] = None
parse_status['matrix_header'].clear()

def _process_raw_result(self, cmd_idx, raw_output):
@@ -130,7 +192,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
content = raw_output.splitlines()
parsing_status = {
'results': {},
'parsing_matrix': False,
'benchmark_type': None,
'matrix_header': [],
'test_name': '',
}
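
As a quick illustration of what the updated _preprocess produces, the sketch below rebuilds the command string for one sample set of arguments, mirroring the flag handling shown in the diff above. The bin path and all argument values here are hypothetical, chosen only for the example.

# Minimal sketch of the command construction in _preprocess (illustrative only).
# '/opt/superbench/bin' and the argument values below are hypothetical.
import os

bin_dir, bin_name = '/opt/superbench/bin', 'nvbandwidth'
args = {
    'buffer_size': 256, 'test_cases': '0, 1, 19', 'skip_verification': True,
    'disable_affinity': False, 'use_mean': True, 'num_loops': 100,
}

command = os.path.join(bin_dir, bin_name)
if args['buffer_size']:
    command += f" --bufferSize {args['buffer_size']}"
if args['test_cases']:
    command += ' --testcase ' + ' '.join(t.strip() for t in args['test_cases'].split(','))
if args['skip_verification']:
    command += ' --skipVerification'
if args['disable_affinity']:
    command += ' --disableAffinity'
if args['use_mean']:
    command += ' --useMean'
if args['num_loops']:
    command += f" --testSamples {args['num_loops']}"

print(command)
# /opt/superbench/bin/nvbandwidth --bufferSize 256 --testcase 0 1 19 --skipVerification --useMean --testSamples 100
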
88 changes: 44 additions & 44 deletions tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
@@ -5,6 +5,7 @@

import unittest

from tests.helper import decorator
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform

@@ -18,63 +19,62 @@ def setUpClass(cls):
cls.createMockEnvs(cls)
cls.createMockFiles(cls, ['bin/nvbandwidth'])

def test_nvbandwidth_result_parsing_real_output(self):
"""Test NV Bandwidth benchmark result parsing."""
def test_nvbandwidth_preprocess(self):
"""Test NV Bandwidth benchmark preprocess."""
benchmark_name = 'nvbandwidth'
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
assert (benchmark_class)

# Test preprocess with default parameters
benchmark = benchmark_class(benchmark_name, parameters='')

# Preprocess and validate command
assert benchmark._preprocess()
assert benchmark.return_code == ReturnCode.SUCCESS

# Provided raw output
raw_output = """
nvbandwidth Version: v0.6
Built from Git version:
CUDA Runtime Version: 12040
CUDA Driver Version: 12040
Driver Version: 550.54.15
Device 0: NVIDIA GH200 480GB (00000009:01:00)
Running host_to_device_memcpy_ce.
memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)
0 1
0 337.55 2142.4
1 2142.4 337.55
SUM host_to_device_memcpy_ce 337.55
# Test preprocess with specified parameters
parameters = (
'--buffer_size 256 '
'--test_cases 0,1,2,19,20 '
'--skip_verification '
'--disable_affinity '
'--use_mean '
'--num_loops 100'
)
benchmark = benchmark_class(benchmark_name, parameters=parameters)
assert benchmark._preprocess()
assert benchmark.return_code == ReturnCode.SUCCESS

Running device_to_host_memcpy_ce.
memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)
0 1 2
0 295.23 241.2 254.0
1 241.2 295.2 254.0
# Check command
assert (1 == len(benchmark._commands))
assert ('--bufferSize 256' in benchmark._commands[0])
assert ('--testcase 0 1 2 19 20' in benchmark._commands[0])
assert ('--skipVerification' in benchmark._commands[0])
assert ('--disableAffinity' in benchmark._commands[0])
assert ('--useMean' in benchmark._commands[0])
assert ('--testSamples 100' in benchmark._commands[0])

@decorator.load_data('tests/data/nvbandwidth_results.log')
def test_nvbandwidth_result_parsing_real_output(self, results):
"""Test NV Bandwidth benchmark result parsing."""
benchmark_name = 'nvbandwidth'
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
assert (benchmark_class)

SUM device_to_host_memcpy_ce 295.23
benchmark = benchmark_class(benchmark_name, parameters='')

Waived:
Waived:
Waived:
Running host_to_device_bidirectional_memcpy_ce.
memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 160.02
Waived:
"""
# Preprocess and validate command
assert benchmark._preprocess()

# Parse the provided raw output
assert benchmark._process_raw_result(0, raw_output)
assert benchmark._process_raw_result(0, results)
assert benchmark.return_code == ReturnCode.SUCCESS

# Validate parsed results
assert benchmark.result['host_to_device_memcpy_ce_bandwidth_cpu0_gpu0'][0] == 337.55
assert benchmark.result['host_to_device_memcpy_ce_bandwidth_cpu0_gpu1'][0] == 2142.4
assert benchmark.result['device_to_host_memcpy_ce_bandwidth_cpu0_gpu1'][0] == 241.2
assert benchmark.result['device_to_host_memcpy_ce_sum_bandwidth'][0] == 295.23
assert 'host_to_device_bidirectional_memcpy_ce_bandwidth_cpu0_gpu0' in benchmark.result
assert benchmark.result['host_to_device_bidirectional_memcpy_ce_bandwidth_cpu0_gpu0'][0] == 160.02
assert benchmark.result['host_to_device_memcpy_ce_cpu0_gpu0_bw'][0] == 369.36
assert benchmark.result['host_to_device_memcpy_ce_cpu0_gpu1_bw'][0] == 269.33
assert benchmark.result['host_to_device_memcpy_ce_sum_bw'][0] == 1985.60
assert benchmark.result['device_to_host_memcpy_ce_cpu0_gpu1_bw'][0] == 312.11
assert benchmark.result['device_to_host_memcpy_ce_sum_bw'][0] == 607.26
assert benchmark.result['host_device_latency_sm_cpu0_gpu0_lat'][0] == 772.58
assert benchmark.result['host_device_latency_sm_sum_lat'][0] == 772.58
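
For context on the metric names these assertions check, the sketch below shows how a single matrix row from the sample log maps to result metrics under the new naming scheme. It is a simplified stand-in for the logic in _process_raw_line, not the benchmark code itself; the row and SUM values are taken from the log data added below.

# Simplified stand-in for the matrix parsing in _process_raw_line (illustration only).
# New metric names follow '{test}_cpu{row}_gpu{col}_{bw|lat}' plus '{test}_sum_{bw|lat}'.
test_name, benchmark_type = 'host_to_device_memcpy_ce', 'bw'
matrix_header = ['0', '1', '2']                  # GPU column indices from the matrix header row
row_data = '0 369.36 269.33 412.11'.split()      # first CPU row from the sample log

results = {}
row_index = row_data[0]
for col_index, value in enumerate(row_data[1:], start=1):
    col_header = matrix_header[col_index - 1]
    results[f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'] = float(value)
results[f'{test_name}_sum_{benchmark_type}'] = 1985.60   # from the 'SUM host_to_device_memcpy_ce' line

print(results['host_to_device_memcpy_ce_cpu0_gpu1_bw'])  # 269.33
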
134 changes: 134 additions & 0 deletions tests/data/nvbandwidth_results.log
@@ -0,0 +1,134 @@
nvbandwidth Version: v0.6
Built from Git version: v0.6

CUDA Runtime Version: 12040
CUDA Driver Version: 12040
Driver Version: 550.54.15

Device 0: NVIDIA GH200 480GB (00000009:01:00)

Running host_to_device_memcpy_ce.
memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)
0 1 2
0 369.36 269.33 412.11
1 323.36 299.33 312.11

SUM host_to_device_memcpy_ce 1985.60

Running device_to_host_memcpy_ce.
memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)
0 1
0 295.15 312.11

SUM device_to_host_memcpy_ce 607.26

Running host_to_device_bidirectional_memcpy_ce.
memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 176.92

SUM host_to_device_bidirectional_memcpy_ce 176.92

Running device_to_host_bidirectional_memcpy_ce.
memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 187.26

SUM device_to_host_bidirectional_memcpy_ce 187.26

Waived:
Waived:
Waived:
Waived:
Running all_to_host_memcpy_ce.
memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)
0
0 295.15

SUM all_to_host_memcpy_ce 295.15

Running all_to_host_bidirectional_memcpy_ce.
memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 187.00

SUM all_to_host_bidirectional_memcpy_ce 187.00

Running host_to_all_memcpy_ce.
memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)
0
0 370.13

SUM host_to_all_memcpy_ce 370.13

Running host_to_all_bidirectional_memcpy_ce.
memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 176.86

SUM host_to_all_bidirectional_memcpy_ce 176.86

Waived:
Waived:
Waived:
Waived:
Running host_to_device_memcpy_sm.
memcpy SM CPU(row) -> GPU(column) bandwidth (GB/s)
0
0 372.33

SUM host_to_device_memcpy_sm 372.33

Running device_to_host_memcpy_sm.
memcpy SM CPU(row) <- GPU(column) bandwidth (GB/s)
0
0 351.93

SUM device_to_host_memcpy_sm 351.93

Waived:
Waived:
Waived:
Waived:
Running all_to_host_memcpy_sm.
memcpy SM CPU(row) <- GPU(column) bandwidth (GB/s)
0
0 352.98

SUM all_to_host_memcpy_sm 352.98

Running all_to_host_bidirectional_memcpy_sm.
memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 156.53

SUM all_to_host_bidirectional_memcpy_sm 156.53

Running host_to_all_memcpy_sm.
memcpy SM CPU(row) -> GPU(column) bandwidth (GB/s)
0
0 360.93

SUM host_to_all_memcpy_sm 360.93

Running host_to_all_bidirectional_memcpy_sm.
memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 247.56

SUM host_to_all_bidirectional_memcpy_sm 247.56

Waived:
Waived:
Waived:
Waived:
Running host_device_latency_sm.
memory latency SM CPU(row) <-> GPU(column) (ns)
0
0 772.58

SUM host_device_latency_sm 772.58

Waived:
NOTE: The reported results may not reflect the full capabilities of the platform.
Performance can vary with software drivers, hardware clocks, and system topology.
