microsoft · yzygitzh · Dec 8, 2023 · Dec 7, 2023 · Dec 7, 2023 · Dec 7, 2023
@@ -256,6 +256,9 @@ Measure the memory copy bandwidth performed by GPU SM/DMA engine, including devi
 | gpu[0-9]+\_and\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw   | bandwidth (GB/s) | Same as above, but generated by --dtoh --bidirectional.                                                                                  |
 | gpu[0-9]+\_and\_gpu[0-9]+\_by\_(sm\|dma)\_bw                | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing its own memory using DMA engine or GPU SM.                                    |
 | gpu[0-9]+\_and\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. |
+| gpu[0-9]+\_to\_gpu\_all\_write\_by\_sm\_bw                  | bandwidth (GB/s) | The unidirectional bandwidth of one GPU writing all peer GPUs' memory using GPU SM with peer communication enabled.                      |
+| gpu\_all\_to\_gpu[0-9]+\_write\_by\_sm\_bw                  | bandwidth (GB/s) | The unidirectional bandwidth of all peer GPUs writing one GPU's memory using GPU SM with peer communication enabled.                     |
+| gpu\_all\_to\_gpu\_all\_write\_by\_sm\_bw                   | bandwidth (GB/s) | The unidirectional bandwidth of all peer GPUs writing all peer GPUs' memory using GPU SM with peer communication enabled.                |
 
 ### `ib-loopback`
 

@@ -12,7 +12,9 @@
 
 if __name__ == '__main__':
     context = BenchmarkRegistry.create_benchmark_context(
-        'gpu-copy-bw', platform=Platform.CUDA, parameters='--mem_type htod dtoh dtod --copy_type sm dma'
+        'gpu-copy-bw',
+        platform=Platform.CUDA,
+        parameters='--mem_type htod dtoh dtod one_to_all all_to_one all_to_all --copy_type sm dma'
     )
     # For ROCm environment, please specify the benchmark name and the platform as the following.
     # context = BenchmarkRegistry.create_benchmark_context(

@@ -22,7 +22,7 @@ def __init__(self, name, parameters=''):
         super().__init__(name, parameters)
 
         self._bin_name = 'gpu_copy'
-        self._mem_types = ['htod', 'dtoh', 'dtod']
+        self._mem_types = ['htod', 'dtoh', 'dtod', 'one_to_all', 'all_to_one', 'all_to_all']
         self._copy_types = ['sm', 'dma']
 
     def add_parser_arguments(self):