Expose nvbandwidth variables and add test case for _preprocess.
hongtaozhang committed Nov 21, 2024
1 parent 1fdeb9c commit 550219b
Showing 3 changed files with 250 additions and 54 deletions.
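
For quick reference, the sketch below summarizes how the newly exposed benchmark arguments map onto nvbandwidth CLI flags, as implemented in the _preprocess changes further down. It is an illustrative summary, not part of the diff; the defaults noted in the comments are those added in this commit.

# Argument-to-flag mapping introduced in nvbandwidth.py (summary sketch, not part of the diff).
ARG_TO_FLAG = {
    'buffer_size': '--bufferSize',              # memcpy buffer size in MiB, default 64
    'test_cases': '--testcase',                 # comma-separated names/indices, space-joined on the command line
    'skip_verification': '--skipVerification',  # store_true, default False
    'disable_affinity': '--disableAffinity',    # store_true, default False
    'use_mean': '--useMean',                    # store_true, default False
    'num_loops': '--testSamples',               # number of benchmark iterations, default 3
}
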
82 changes: 72 additions & 10 deletions superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -28,6 +28,48 @@ def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()

self._parser.add_argument(
'--buffer_size',
type=int,
default=64,
required=False,
help='Memcpy buffer size in MiB. Default is 64.',
)

self._parser.add_argument(
'--test_cases',
type=str,
default='',
required=False,
help='Specify the test case(s) to run, either by name or index. By default, all test cases are executed.',
)

self._parser.add_argument(
'--skip_verification',
action='store_true',
help='Skip data verification after copy. Default is False.',
)

self._parser.add_argument(
'--disable_affinity',
action='store_true',
help='Disable automatic CPU affinity control. Default is False.',
)

self._parser.add_argument(
'--use_mean',
action='store_true',
help='Use mean instead of median for results. Default is False.',
)

self._parser.add_argument(
'--num_loops',
type=int,
default=3,
required=False,
help='Number of benchmark iterations (maps to --testSamples). Default is 3.',
)

def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
@@ -43,6 +85,24 @@ def _preprocess(self):
# Construct the command for nvbandwidth
command = os.path.join(self._args.bin_dir, self._bin_name)

if self._args.buffer_size:
command += f' --bufferSize {self._args.buffer_size}'

if self._args.test_cases:
command += ' --testcase ' + ' '.join([testcase.strip() for testcase in self._args.test_cases.split(',')])

if self._args.skip_verification:
command += ' --skipVerification'

if self._args.disable_affinity:
command += ' --disableAffinity'

if self._args.use_mean:
command += ' --useMean'

if self._args.num_loops:
command += f' --testSamples {self._args.num_loops}'

self._commands.append(command)

return True
@@ -57,17 +117,17 @@ def _process_raw_line(self, line, parse_status):
line (str): A single line of raw output from the benchmark.
parse_status (dict): A dictionary to maintain the current parsing state and results. It should contain:
- 'test_name' (str): The name of the current test being parsed.
- 'parsing_matrix' (bool): A flag indicating if matrix data is being parsed.
- 'benchmark_type' (str): 'bw' or 'lat'. It also indicates whether matrix data is being parsed.
- 'matrix_header' (list): The header of the matrix being parsed.
- 'results' (dict): A dictionary to store the parsed results.
Returns:
Return:
None
"""
# Regular expressions for summary line and matrix header detection
block_start_pattern = re.compile(r'^Running\s+(.+)$')
summary_pattern = re.compile(r'SUM (\S+) (\d+\.\d+)')
matrix_header_line = re.compile(r'^memcpy CE CPU\(row\)')
matrix_header_line = re.compile(r'^(memcpy|memory latency)')
matrix_row_pattern = re.compile(r'^\s*\d')

line = line.strip()
@@ -79,25 +139,26 @@

# Detect the start of matrix data
if parse_status['test_name'] and matrix_header_line.match(line):
parse_status['parsing_matrix'] = True
parse_status['benchmark_type'] = 'bw' if 'bandwidth' in line else 'lat'
return

# Parse the matrix header
if (
parse_status['test_name'] and parse_status['parsing_matrix'] and not parse_status['matrix_header']
parse_status['test_name'] and parse_status['benchmark_type'] and not parse_status['matrix_header']
and matrix_row_pattern.match(line)
):
parse_status['matrix_header'] = line.split()
return

# Parse matrix rows
if parse_status['test_name'] and parse_status['parsing_matrix'] and matrix_row_pattern.match(line):
if parse_status['test_name'] and parse_status['benchmark_type'] and matrix_row_pattern.match(line):
row_data = line.split()
row_index = row_data[0]
for col_index, value in enumerate(row_data[1:], start=1):
col_header = parse_status['matrix_header'][col_index - 1]
test_name = parse_status['test_name']
metric_name = f'{test_name}_bandwidth_cpu{row_index}_gpu{col_header}'
benchmark_type = parse_status['benchmark_type']
metric_name = f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'
parse_status['results'][metric_name] = float(value)
return

@@ -106,11 +167,12 @@ def _process_raw_result(self, cmd_idx, raw_output):
if summary_match:
value = float(summary_match.group(2))
test_name = parse_status['test_name']
parse_status['results'][f'{test_name}_sum_bandwidth'] = value
benchmark_type = parse_status['benchmark_type']
parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = value

# Reset parsing state for next test
parse_status['test_name'] = ''
parse_status['parsing_matrix'] = False
parse_status['benchmark_type'] = None
parse_status['matrix_header'].clear()

def _process_raw_result(self, cmd_idx, raw_output):
@@ -130,7 +192,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
content = raw_output.splitlines()
parsing_status = {
'results': {},
'parsing_matrix': False,
'benchmark_type': None,
'matrix_header': [],
'test_name': '',
}
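
As a quick illustration of what the updated _preprocess produces, the sketch below rebuilds the command string for one sample set of arguments, mirroring the flag handling shown in the diff above. The bin path and all argument values here are hypothetical, chosen only for the example.

# Minimal sketch of the command construction in _preprocess (illustrative only).
# '/opt/superbench/bin' and the argument values below are hypothetical.
import os

bin_dir, bin_name = '/opt/superbench/bin', 'nvbandwidth'
args = {
    'buffer_size': 256, 'test_cases': '0, 1, 19', 'skip_verification': True,
    'disable_affinity': False, 'use_mean': True, 'num_loops': 100,
}

command = os.path.join(bin_dir, bin_name)
if args['buffer_size']:
    command += f" --bufferSize {args['buffer_size']}"
if args['test_cases']:
    command += ' --testcase ' + ' '.join(t.strip() for t in args['test_cases'].split(','))
if args['skip_verification']:
    command += ' --skipVerification'
if args['disable_affinity']:
    command += ' --disableAffinity'
if args['use_mean']:
    command += ' --useMean'
if args['num_loops']:
    command += f" --testSamples {args['num_loops']}"

print(command)
# /opt/superbench/bin/nvbandwidth --bufferSize 256 --testcase 0 1 19 --skipVerification --useMean --testSamples 100
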
88 changes: 44 additions & 44 deletions tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
@@ -5,6 +5,7 @@

import unittest

from tests.helper import decorator
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform

@@ -18,63 +19,62 @@ def setUpClass(cls):
cls.createMockEnvs(cls)
cls.createMockFiles(cls, ['bin/nvbandwidth'])

def test_nvbandwidth_result_parsing_real_output(self):
"""Test NV Bandwidth benchmark result parsing."""
def test_nvbandwidth_preprocess(self):
"""Test NV Bandwidth benchmark preprocess."""
benchmark_name = 'nvbandwidth'
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
assert (benchmark_class)

# Test preprocess with default parameters
benchmark = benchmark_class(benchmark_name, parameters='')

# Preprocess and validate command
assert benchmark._preprocess()
assert benchmark.return_code == ReturnCode.SUCCESS

# Provided raw output
raw_output = """
nvbandwidth Version: v0.6
Built from Git version:
CUDA Runtime Version: 12040
CUDA Driver Version: 12040
Driver Version: 550.54.15
Device 0: NVIDIA GH200 480GB (00000009:01:00)
Running host_to_device_memcpy_ce.
memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)
0 1
0 337.55 2142.4
1 2142.4 337.55
SUM host_to_device_memcpy_ce 337.55
# Test preprocess with specified parameters
parameters = (
'--buffer_size 256 '
'--test_cases 0,1,2,19,20 '
'--skip_verification '
'--disable_affinity '
'--use_mean '
'--num_loops 100'
)
benchmark = benchmark_class(benchmark_name, parameters=parameters)
assert benchmark._preprocess()
assert benchmark.return_code == ReturnCode.SUCCESS

Running device_to_host_memcpy_ce.
memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)
0 1 2
0 295.23 241.2 254.0
1 241.2 295.2 254.0
# Check command
assert (1 == len(benchmark._commands))
assert ('--bufferSize 256' in benchmark._commands[0])
assert ('--testcase 0 1 2 19 20' in benchmark._commands[0])
assert ('--skipVerification' in benchmark._commands[0])
assert ('--disableAffinity' in benchmark._commands[0])
assert ('--useMean' in benchmark._commands[0])
assert ('--testSamples 100' in benchmark._commands[0])

@decorator.load_data('tests/data/nvbandwidth_results.log')
def test_nvbandwidth_result_parsing_real_output(self, results):
"""Test NV Bandwidth benchmark result parsing."""
benchmark_name = 'nvbandwidth'
(benchmark_class,
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
assert (benchmark_class)

SUM device_to_host_memcpy_ce 295.23
benchmark = benchmark_class(benchmark_name, parameters='')

Waived:
Waived:
Waived:
Running host_to_device_bidirectional_memcpy_ce.
memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 160.02
Waived:
"""
# Preprocess and validate command
assert benchmark._preprocess()

# Parse the provided raw output
assert benchmark._process_raw_result(0, raw_output)
assert benchmark._process_raw_result(0, results)
assert benchmark.return_code == ReturnCode.SUCCESS

# Validate parsed results
assert benchmark.result['host_to_device_memcpy_ce_bandwidth_cpu0_gpu0'][0] == 337.55
assert benchmark.result['host_to_device_memcpy_ce_bandwidth_cpu0_gpu1'][0] == 2142.4
assert benchmark.result['device_to_host_memcpy_ce_bandwidth_cpu0_gpu1'][0] == 241.2
assert benchmark.result['device_to_host_memcpy_ce_sum_bandwidth'][0] == 295.23
assert 'host_to_device_bidirectional_memcpy_ce_bandwidth_cpu0_gpu0' in benchmark.result
assert benchmark.result['host_to_device_bidirectional_memcpy_ce_bandwidth_cpu0_gpu0'][0] == 160.02
assert benchmark.result['host_to_device_memcpy_ce_cpu0_gpu0_bw'][0] == 369.36
assert benchmark.result['host_to_device_memcpy_ce_cpu0_gpu1_bw'][0] == 269.33
assert benchmark.result['host_to_device_memcpy_ce_sum_bw'][0] == 1985.60
assert benchmark.result['device_to_host_memcpy_ce_cpu0_gpu1_bw'][0] == 312.11
assert benchmark.result['device_to_host_memcpy_ce_sum_bw'][0] == 607.26
assert benchmark.result['host_device_latency_sm_cpu0_gpu0_lat'][0] == 772.58
assert benchmark.result['host_device_latency_sm_sum_lat'][0] == 772.58
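
For context on the metric names these assertions check, the sketch below shows how a single matrix row from the sample log maps to result metrics under the new naming scheme. It is a simplified stand-in for the logic in _process_raw_line, not the benchmark code itself; the row and SUM values are taken from the log data added below.

# Simplified stand-in for the matrix parsing in _process_raw_line (illustration only).
# New metric names follow '{test}_cpu{row}_gpu{col}_{bw|lat}' plus '{test}_sum_{bw|lat}'.
test_name, benchmark_type = 'host_to_device_memcpy_ce', 'bw'
matrix_header = ['0', '1', '2']                  # GPU column indices from the matrix header row
row_data = '0 369.36 269.33 412.11'.split()      # first CPU row from the sample log

results = {}
row_index = row_data[0]
for col_index, value in enumerate(row_data[1:], start=1):
    col_header = matrix_header[col_index - 1]
    results[f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'] = float(value)
results[f'{test_name}_sum_{benchmark_type}'] = 1985.60   # from the 'SUM host_to_device_memcpy_ce' line

print(results['host_to_device_memcpy_ce_cpu0_gpu1_bw'])  # 269.33
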
134 changes: 134 additions & 0 deletions tests/data/nvbandwidth_results.log
@@ -0,0 +1,134 @@
nvbandwidth Version: v0.6
Built from Git version: v0.6

CUDA Runtime Version: 12040
CUDA Driver Version: 12040
Driver Version: 550.54.15

Device 0: NVIDIA GH200 480GB (00000009:01:00)

Running host_to_device_memcpy_ce.
memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)
0 1 2
0 369.36 269.33 412.11
1 323.36 299.33 312.11

SUM host_to_device_memcpy_ce 1985.60

Running device_to_host_memcpy_ce.
memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)
0 1
0 295.15 312.11

SUM device_to_host_memcpy_ce 607.26

Running host_to_device_bidirectional_memcpy_ce.
memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 176.92

SUM host_to_device_bidirectional_memcpy_ce 176.92

Running device_to_host_bidirectional_memcpy_ce.
memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 187.26

SUM device_to_host_bidirectional_memcpy_ce 187.26

Waived:
Waived:
Waived:
Waived:
Running all_to_host_memcpy_ce.
memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)
0
0 295.15

SUM all_to_host_memcpy_ce 295.15

Running all_to_host_bidirectional_memcpy_ce.
memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 187.00

SUM all_to_host_bidirectional_memcpy_ce 187.00

Running host_to_all_memcpy_ce.
memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)
0
0 370.13

SUM host_to_all_memcpy_ce 370.13

Running host_to_all_bidirectional_memcpy_ce.
memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 176.86

SUM host_to_all_bidirectional_memcpy_ce 176.86

Waived:
Waived:
Waived:
Waived:
Running host_to_device_memcpy_sm.
memcpy SM CPU(row) -> GPU(column) bandwidth (GB/s)
0
0 372.33

SUM host_to_device_memcpy_sm 372.33

Running device_to_host_memcpy_sm.
memcpy SM CPU(row) <- GPU(column) bandwidth (GB/s)
0
0 351.93

SUM device_to_host_memcpy_sm 351.93

Waived:
Waived:
Waived:
Waived:
Running all_to_host_memcpy_sm.
memcpy SM CPU(row) <- GPU(column) bandwidth (GB/s)
0
0 352.98

SUM all_to_host_memcpy_sm 352.98

Running all_to_host_bidirectional_memcpy_sm.
memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 156.53

SUM all_to_host_bidirectional_memcpy_sm 156.53

Running host_to_all_memcpy_sm.
memcpy SM CPU(row) -> GPU(column) bandwidth (GB/s)
0
0 360.93

SUM host_to_all_memcpy_sm 360.93

Running host_to_all_bidirectional_memcpy_sm.
memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)
0
0 247.56

SUM host_to_all_bidirectional_memcpy_sm 247.56

Waived:
Waived:
Waived:
Waived:
Running host_device_latency_sm.
memory latency SM CPU(row) <-> GPU(column) (ns)
0
0 772.58

SUM host_device_latency_sm 772.58

Waived:
NOTE: The reported results may not reflect the full capabilities of the platform.
Performance can vary with software drivers, hardware clocks, and system topology.
