Benchmarks: Micro benchmark - Add graph mode in NCCL/RCCL benchmarks for latency metrics (#583)

**Description** Revise the NCCL/RCCL benchmarks to support graph mode and add latency metrics.
microsoft · Dec 5, 2023 · 254ea7f · 254ea7f
1 parent 9ae8c67
commit 254ea7f
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 3 deletions.
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_nccl_bw_performance.py b/superbench/benchmarks/micro_benchmarks/cuda_nccl_bw_performance.py
@@ -88,6 +88,12 @@ def add_parser_arguments(self):
             default=5,
             help='Number of warmup iterations. Default: 5.',
         )
+        self._parser.add_argument(
+            '--graph_iters',
+            type=int,
+            default=0,
+            help='Number of graph launch iterations. Set to 0 to disable graph mode. Default: 0.',
+        )
 
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -117,9 +123,9 @@ def _preprocess(self):
                 return False
 
             command = os.path.join(self._args.bin_dir, self._bin_name)
-            command += ' -b {} -e {} -f {} -g {} -c {} -n {} -w {}'.format(
+            command += ' -b {} -e {} -f {} -g {} -c {} -n {} -w {} -G {}'.format(
                 self._args.minbytes, self._args.maxbytes, str(self._args.stepfactor), str(self._args.ngpus),
-                str(self._args.check), str(self._args.iters), str(self._args.warmup_iters)
+                str(self._args.check), str(self._args.iters), str(self._args.warmup_iters), str(self._args.graph_iters)
             )
             self._commands.append(command)
 

diff --git a/superbench/config/azure_ndmv4.yaml b/superbench/config/azure_ndmv4.yaml
@@ -73,6 +73,17 @@ superbench:
             NCCL_IB_DISABLE: '0'
       parameters:
         ngpus: 8
+    nccl-lat:default:
+      enable: true
+      modes:
+        - name: mpi
+          proc_num: 8
+          node_num: 1
+      parameters:
+        maxbytes: 16M
+        warmup_iters: 20
+        iters: 1000
+        graph_iters: 1
     ib-loopback:
       enable: true
       modes:

diff --git a/superbench/config/azure_ndv4.yaml b/superbench/config/azure_ndv4.yaml
@@ -79,6 +79,17 @@ superbench:
             NCCL_IB_DISABLE: '0'
       parameters:
         ngpus: 8
+    nccl-lat:default:
+      enable: true
+      modes:
+        - name: mpi
+          proc_num: 8
+          node_num: 1
+      parameters:
+        maxbytes: 16M
+        warmup_iters: 20
+        iters: 1000
+        graph_iters: 1
     ib-loopback:
       enable: true
       modes:

diff --git a/tests/benchmarks/micro_benchmarks/test_cuda_nccl_bw_performance.py b/tests/benchmarks/micro_benchmarks/test_cuda_nccl_bw_performance.py
@@ -65,6 +65,7 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
         assert (benchmark._args.check == 0)
         assert (benchmark._args.iters == 20)
         assert (benchmark._args.warmup_iters == 5)
+        assert (benchmark._args.graph_iters == 0)
 
         # Check command list
         bin_names = [
@@ -73,7 +74,7 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
         ]
 
         command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1]
-        expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5'.format(bin_names[0])
+        expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5 -G 0'.format(bin_names[0])
         assert (command == expected_command)
 
         # Check results and metrics.