Skip to content

Commit

Permalink
revision
Browse files Browse the repository at this point in the history
  • Loading branch information
yukirora committed Dec 5, 2023
1 parent 9851585 commit b315fc1
Show file tree
Hide file tree
Showing 8 changed files with 25 additions and 25 deletions.
25 changes: 11 additions & 14 deletions superbench/benchmarks/model_benchmarks/megatron_gpt3.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from mpi4py import MPI

from superbench.benchmarks import BenchmarkRegistry
from superbench.benchmarks.context import Precision
from superbench.benchmarks.context import Platform, Precision
from superbench.benchmarks.model_benchmarks.model_base import ModelBenchmark
from superbench.benchmarks.return_code import ReturnCode
from superbench.common.utils import logger, run_command
Expand Down Expand Up @@ -156,15 +156,6 @@ def _preprocess(self):

return True

def _is_rank_0(self):
"""Check if the rank is 0."""
# If it's invoked by MPI and rank is not 0, empty content is expected
if os.getenv('OMPI_COMM_WORLD_RANK'):
rank = int(os.getenv('OMPI_COMM_WORLD_RANK'))
if rank == 0:
return True
return False

def _parse_log(self, output):
"""Parse log output and get the performance."""
tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)')
Expand Down Expand Up @@ -333,7 +324,7 @@ def _train_step(self, precision): # noqa: E501
# last rank will print the result, first rank will print the memory usage
if self._num_nodes == 1 or \
int(os.environ['OMPI_COMM_WORLD_RANK']) == int(os.environ['OMPI_COMM_WORLD_SIZE']) - 1 \
or self._is_rank_0():
or int(os.environ['OMPI_COMM_WORLD_RANK']) == 0:
iteration_times, tflops, mem_allocated, max_mem_allocated = self._parse_log(output.stdout)
if len(tflops) > 0:
info['tflops'] = tflops
Expand Down Expand Up @@ -364,10 +355,15 @@ def _process_other_info(self, model_action, precision, info):
precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'bfloat16': 'bf16'}
if precision.value in precision_metric.keys():
precision = precision_metric[precision.value]
for metric, values in info.items():
metric = '{}_{}_{}'.format(precision, model_action, metric)
for key, values in info.items():
metric = '{}_{}_{}'.format(precision, model_action, key)
self._result.add_raw_data(metric, values, self._args.log_raw_data)
self._result.add_result(metric, statistics.mean(values))
logger.info(
'Average {} - round: {}, model: {}, precision: {}, value: {:.6f}.'.format(
key, self._curr_run_index, self._name, precision, statistics.mean(values)
)
)

def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
Expand Down Expand Up @@ -509,4 +505,5 @@ def _cal_params_count(self):


# Register GPT3 benchmark.
BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='')
BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.CUDA)
BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.ROCM)
13 changes: 8 additions & 5 deletions superbench/benchmarks/model_benchmarks/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import time
import statistics
from abc import abstractmethod
from typing import Union

from superbench.common.utils import logger, stdout_logger
from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
Expand Down Expand Up @@ -262,11 +263,13 @@ def __train(self, precision):
return False

# The unit of step time should be millisecond.
step_times, info = self._train_step(precision)
step_times = self._train_step(precision)
if isinstance(step_times, tuple):
info = step_times[1]
step_times = step_times[0]
self._process_other_info(ModelAction.TRAIN, precision, info)
step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
if info:
self._process_other_info(info)
if not step_times and not info:
if not step_times:
self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
return False

Expand Down Expand Up @@ -304,7 +307,7 @@ def __inference(self, precision):
return True

@abstractmethod
def _train_step(self, precision):
def _train_step(self, precision) -> Union[list, tuple]:
"""Define the training process.
Args:
Expand Down
2 changes: 1 addition & 1 deletion superbench/benchmarks/model_benchmarks/pytorch_bert.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def _train_step(self, precision):
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end, check_frequency):
return duration, None
return duration

def _inference_step(self, precision):
"""Define the inference process.
Expand Down
2 changes: 1 addition & 1 deletion superbench/benchmarks/model_benchmarks/pytorch_cnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def _train_step(self, precision):
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end, check_frequency):
return duration, None
return duration

def _inference_step(self, precision):
"""Define the inference process.
Expand Down
2 changes: 1 addition & 1 deletion superbench/benchmarks/model_benchmarks/pytorch_gpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def _train_step(self, precision):
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end, check_frequency):
return duration, None
return duration

def _inference_step(self, precision):
"""Define the inference process.
Expand Down
2 changes: 1 addition & 1 deletion superbench/benchmarks/model_benchmarks/pytorch_lstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def _train_step(self, precision):
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end, check_frequency):
return duration, None
return duration

def _inference_step(self, precision):
"""Define the inference process.
Expand Down
2 changes: 1 addition & 1 deletion tests/benchmarks/model_benchmarks/test_model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def _train_step(self, precision):
duration = []
for i in range(self._args.num_steps):
duration.append(2.0)
return duration, None
return duration

def _inference_step(self, precision):
"""Define the inference process.
Expand Down
2 changes: 1 addition & 1 deletion tests/benchmarks/model_benchmarks/test_pytorch_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def _train_step(self, precision):
if idx >= self._args.num_warmup:
duration.append((end - start) * 1000)

return duration, None
return duration

def _inference_step(self, precision):
"""Define the inference process.
Expand Down

0 comments on commit b315fc1

Please sign in to comment.