revision

microsoft · Dec 5, 2023 · b315fc1 · b315fc1
1 parent 9851585
commit b315fc1
Show file tree

Hide file tree

Showing 8 changed files with 25 additions and 25 deletions.
diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -15,7 +15,7 @@
 from mpi4py import MPI
 
 from superbench.benchmarks import BenchmarkRegistry
-from superbench.benchmarks.context import Precision
+from superbench.benchmarks.context import Platform, Precision
 from superbench.benchmarks.model_benchmarks.model_base import ModelBenchmark
 from superbench.benchmarks.return_code import ReturnCode
 from superbench.common.utils import logger, run_command
@@ -156,15 +156,6 @@ def _preprocess(self):
 
         return True
 
-    def _is_rank_0(self):
-        """Check if the rank is 0."""
-        # If it's invoked by MPI and rank is not 0, empty content is expected
-        if os.getenv('OMPI_COMM_WORLD_RANK'):
-            rank = int(os.getenv('OMPI_COMM_WORLD_RANK'))
-            if rank == 0:
-                return True
-        return False
-
     def _parse_log(self, output):
         """Parse log output and get the performance."""
         tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)')
@@ -333,7 +324,7 @@ def _train_step(self, precision):    # noqa: E501
         # last rank will print the result, first rank will print the memory usage
         if self._num_nodes == 1 or \
             int(os.environ['OMPI_COMM_WORLD_RANK']) == int(os.environ['OMPI_COMM_WORLD_SIZE']) - 1 \
-                or self._is_rank_0():
+                or int(os.environ['OMPI_COMM_WORLD_RANK']) == 0:
             iteration_times, tflops, mem_allocated, max_mem_allocated = self._parse_log(output.stdout)
             if len(tflops) > 0:
                 info['tflops'] = tflops
@@ -364,10 +355,15 @@ def _process_other_info(self, model_action, precision, info):
         precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'bfloat16': 'bf16'}
         if precision.value in precision_metric.keys():
             precision = precision_metric[precision.value]
-        for metric, values in info.items():
-            metric = '{}_{}_{}'.format(precision, model_action, metric)
+        for key, values in info.items():
+            metric = '{}_{}_{}'.format(precision, model_action, key)
             self._result.add_raw_data(metric, values, self._args.log_raw_data)
             self._result.add_result(metric, statistics.mean(values))
+            logger.info(
+                'Average {} - round: {}, model: {}, precision: {}, value: {:.6f}.'.format(
+                    key, self._curr_run_index, self._name, precision, statistics.mean(values)
+                )
+            )
 
     def _judge_gpu_availability(self):
         """Judge GPUs' availability according to arguments and running environment."""
@@ -509,4 +505,5 @@ def _cal_params_count(self):
 
 
 # Register GPT3 benchmark.
-BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='')
+BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.CUDA)
+BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.ROCM)
diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -7,6 +7,7 @@
 import time
 import statistics
 from abc import abstractmethod
+from typing import Union
 
 from superbench.common.utils import logger, stdout_logger
 from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
@@ -262,11 +263,13 @@ def __train(self, precision):
             return False
 
         # The unit of step time should be millisecond.
-        step_times, info = self._train_step(precision)
+        step_times = self._train_step(precision)
+        if isinstance(step_times, tuple):
+            info = step_times[1]
+            step_times = step_times[0]
+            self._process_other_info(ModelAction.TRAIN, precision, info)
         step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
-        if info:
-            self._process_other_info(info)
-        if not step_times and not info:
+        if not step_times:
             self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
             return False
 
@@ -304,7 +307,7 @@ def __inference(self, precision):
         return True
 
     @abstractmethod
-    def _train_step(self, precision):
+    def _train_step(self, precision) -> Union[list, tuple]:
         """Define the training process.
 
         Args:

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_bert.py b/superbench/benchmarks/model_benchmarks/pytorch_bert.py
@@ -190,7 +190,7 @@ def _train_step(self, precision):
                     duration.append((end - start) * 1000)
                     self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end, check_frequency):
-                    return duration, None
+                    return duration
 
     def _inference_step(self, precision):
         """Define the inference process.

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_cnn.py b/superbench/benchmarks/model_benchmarks/pytorch_cnn.py
@@ -116,7 +116,7 @@ def _train_step(self, precision):
                     duration.append((end - start) * 1000)
                     self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end, check_frequency):
-                    return duration, None
+                    return duration
 
     def _inference_step(self, precision):
         """Define the inference process.

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py b/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py
@@ -184,7 +184,7 @@ def _train_step(self, precision):
                     duration.append((end - start) * 1000)
                     self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end, check_frequency):
-                    return duration, None
+                    return duration
 
     def _inference_step(self, precision):
         """Define the inference process.

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_lstm.py b/superbench/benchmarks/model_benchmarks/pytorch_lstm.py
@@ -156,7 +156,7 @@ def _train_step(self, precision):
                     duration.append((end - start) * 1000)
                     self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end, check_frequency):
-                    return duration, None
+                    return duration
 
     def _inference_step(self, precision):
         """Define the inference process.

diff --git a/tests/benchmarks/model_benchmarks/test_model_base.py b/tests/benchmarks/model_benchmarks/test_model_base.py
@@ -82,7 +82,7 @@ def _train_step(self, precision):
         duration = []
         for i in range(self._args.num_steps):
             duration.append(2.0)
-        return duration, None
+        return duration
 
     def _inference_step(self, precision):
         """Define the inference process.

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_base.py b/tests/benchmarks/model_benchmarks/test_pytorch_base.py
@@ -135,7 +135,7 @@ def _train_step(self, precision):
             if idx >= self._args.num_warmup:
                 duration.append((end - start) * 1000)
 
-        return duration, None
+        return duration
 
     def _inference_step(self, precision):
         """Define the inference process.