From dd5a6329ed6e812de1035933af47b72c4f6155e2 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 7 Dec 2023 09:37:09 +0800 Subject: [PATCH] Benchmarks: Add benchmark: Megatron-LM/Megatron-Deepspeed GPT pretrain benchmark (#582) **Description** Megatron-LM/Megatron-Deepspeed GPT pretrain benchmark --- dockerfile/cuda11.1.1.dockerfile | 1 + dockerfile/cuda12.2.dockerfile | 1 + dockerfile/rocm5.0.x.dockerfile | 3 +- dockerfile/rocm5.1.x.dockerfile | 3 +- .../benchmarks/model-benchmarks.md | 23 +- setup.py | 1 + .../benchmarks/model_benchmarks/__init__.py | 3 +- .../model_benchmarks/megatron_gpt3.py | 508 ++++++ .../benchmarks/model_benchmarks/model_base.py | 20 +- superbench/config/default.yaml | 17 + .../model_benchmarks/test_megatron_gpt.py | 357 ++++ tests/data/megatron_deepspeed.log | 1461 +++++++++++++++++ third_party/Makefile | 23 +- third_party/Megatron/requirements.txt | 13 + 14 files changed, 2425 insertions(+), 9 deletions(-) create mode 100644 superbench/benchmarks/model_benchmarks/megatron_gpt3.py create mode 100644 tests/benchmarks/model_benchmarks/test_megatron_gpt.py create mode 100644 tests/data/megatron_deepspeed.log create mode 100644 third_party/Megatron/requirements.txt diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 060985db1..6300eb4bd 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -41,6 +41,7 @@ RUN apt-get update && \ libtinfo5 \ libtool \ lshw \ + python3-mpi4py \ net-tools \ openssh-client \ openssh-server \ diff --git a/dockerfile/cuda12.2.dockerfile b/dockerfile/cuda12.2.dockerfile index a3cf01f33..d3ce049cb 100644 --- a/dockerfile/cuda12.2.dockerfile +++ b/dockerfile/cuda12.2.dockerfile @@ -41,6 +41,7 @@ RUN apt-get update && \ libtinfo5 \ libtool \ lshw \ + python3-mpi4py \ net-tools \ openssh-client \ openssh-server \ diff --git a/dockerfile/rocm5.0.x.dockerfile b/dockerfile/rocm5.0.x.dockerfile index 93db745e5..d9cdeb307 100644 --- 
a/dockerfile/rocm5.0.x.dockerfile +++ b/dockerfile/rocm5.0.x.dockerfile @@ -41,6 +41,7 @@ RUN apt-get update && \ libtinfo5 \ libtool \ lshw \ + python3-mpi4py \ net-tools \ numactl \ openssh-client \ @@ -136,7 +137,7 @@ RUN echo PATH="$PATH" > /etc/environment && \ WORKDIR ${SB_HOME} ADD third_party third_party -RUN make -C third_party rocm -o rocm_hipblaslt +RUN make -C third_party rocm -o rocm_hipblaslt -o megatron_deepspeed -o megatron_lm ADD . . RUN python3 -m pip install --upgrade setuptools==65.7 && \ diff --git a/dockerfile/rocm5.1.x.dockerfile b/dockerfile/rocm5.1.x.dockerfile index 3933f8ac7..8401935d6 100644 --- a/dockerfile/rocm5.1.x.dockerfile +++ b/dockerfile/rocm5.1.x.dockerfile @@ -41,6 +41,7 @@ RUN apt-get update && \ libtinfo5 \ libtool \ lshw \ + python3-mpi4py \ net-tools \ numactl \ openssh-client \ @@ -141,7 +142,7 @@ RUN echo PATH="$PATH" > /etc/environment && \ WORKDIR ${SB_HOME} ADD third_party third_party -RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm -o rocm_hipblaslt +RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm -o rocm_hipblaslt -o megatron_deepspeed -o megatron_lm ADD . . RUN python3 -m pip install --no-cache-dir .[amdworker] && \ diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md index 1b14e28e6..34fdf4c70 100644 --- a/docs/user-tutorial/benchmarks/model-benchmarks.md +++ b/docs/user-tutorial/benchmarks/model-benchmarks.md @@ -37,8 +37,29 @@ For inference, supported percentiles include | Name | Unit | Description | |-----------------------------------------------------------------------------------------|------------------------|------------------------------------------------------------------------------| | model-benchmarks/pytorch-${model_name}/${precision}_train_step_time | time (ms) | The average training step time with fp32/fp16 precision. 
| -| model-benchmarks/pytorch-${model_name}/${precision}_train_throughput | throughput (samples/s) | The average training throughput with fp32/fp16 precision. | +| model-benchmarks/pytorch-${model_name}/${precision}_train_throughput | throughput (samples/s) | The average training throughput with fp32/fp16 precision per GPU. | | model-benchmarks/pytorch-${model_name}/${precision}_inference_step_time | time (ms) | The average inference step time with fp32/fp16 precision. | | model-benchmarks/pytorch-${model_name}/${precision}_inference_throughput | throughput (samples/s) | The average inference throughput with fp32/fp16 precision. | | model-benchmarks/pytorch-${model_name}/${precision}_inference_step_time\_${percentile} | time (ms) | The nth percentile inference step time with fp32/fp16 precision. | | model-benchmarks/pytorch-${model_name}/${precision}_inference_throughput\_${percentile} | throughput (samples/s) | The nth percentile inference throughput with fp32/fp16 precision. | + + +## Megatron Model benchmarks + +### `megatron-gpt` + +#### Introduction + +Run GPT pretrain tasks with float32, float16, bfloat16 precisions with [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) or [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed). + +`tips: batch_size in this benchmark represents global batch size, the batch size on each GPU instance is micro_batch_size.` + +#### Metrics +| Name | Unit | Description | +|---------------------------------------------------|------------------------|---------------------------------------------------------| +| megatron-gpt/${precision}_train_step_time | time (ms) | The average training step time per iteration. | +| megatron-gpt/${precision}_train_throughput | throughput (samples/s) | The average training throughput per iteration. | +| megatron-gpt/${precision}_train_tflops | tflops/s | The average training tflops per second per iteration. 
| +| megatron-gpt/${precision}_train_mem_allocated | GB | The average GPU memory allocated per iteration. | +| megatron-gpt/${precision}_train_max_mem_allocated | GB | The average maximum GPU memory allocated per iteration. | + diff --git a/setup.py b/setup.py index 027add3eb..c78988d11 100644 --- a/setup.py +++ b/setup.py @@ -177,6 +177,7 @@ def run(self): 'xlrd>=2.0.1', 'xlsxwriter>=1.3.8', 'xmltodict>=0.12.0', + 'types-requests', ], extras_require=( lambda x: { diff --git a/superbench/benchmarks/model_benchmarks/__init__.py b/superbench/benchmarks/model_benchmarks/__init__.py index 7625cd0ac..eda0c4985 100644 --- a/superbench/benchmarks/model_benchmarks/__init__.py +++ b/superbench/benchmarks/model_benchmarks/__init__.py @@ -8,5 +8,6 @@ from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import PytorchGPT2 from superbench.benchmarks.model_benchmarks.pytorch_cnn import PytorchCNN from superbench.benchmarks.model_benchmarks.pytorch_lstm import PytorchLSTM +from superbench.benchmarks.model_benchmarks.megatron_gpt3 import MegatronGPT -__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM'] +__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM', 'MegatronGPT'] diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py new file mode 100644 index 000000000..233109780 --- /dev/null +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -0,0 +1,508 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Module of the megatron deepspeed GPT pretrain class.""" + +import json +import os +import statistics +import numpy as np +import requests +import torch +from pathlib import Path +import re + +from superbench.benchmarks import BenchmarkRegistry +from superbench.benchmarks.context import Platform, Precision +from superbench.benchmarks.model_benchmarks.model_base import ModelBenchmark +from superbench.benchmarks.return_code import ReturnCode +from superbench.common.utils import logger, run_command + + +def download_file(url, path): + """Download file from url to path.""" + response = requests.get(url) + with open(path, 'wb') as file: + file.write(response.content) + + +class MegatronGPT(ModelBenchmark): + """The Megatron DeepSpeed GPT pretrain benchmark class.""" + def __init__(self, name, parameters=''): + """Constructor. + + Args: + name (str): benchmark name. + parameters (str): parameters of the benchmark. + """ + super().__init__(name, parameters) + self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16, Precision.BFLOAT16] + + def add_parser_arguments(self): + """Add the specified arguments.""" + super().add_parser_arguments() + self._parser.add_argument('--code_base', type=str, required=False, default='', help='Code base.') + self._parser.add_argument('--dataset_url', type=str, required=False, default=None, help='Dataset URL.') + self._parser.add_argument( + '--vocab_url', + type=str, + required=False, + default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json', + help='Vocab URL.' + ) + self._parser.add_argument( + '--merges_url', + type=str, + required=False, + default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt', + help='Merges URL.' + ) + self._parser.add_argument( + '--tokenizer_type', type=str, required=False, default='GPT2BPETokenizer', help='Tokenizer type.' 
+ ) + self._parser.add_argument('--model_size', type=int, required=False, default=6.7, help='Model size.') + self._parser.add_argument('--num_layers', type=int, required=False, default=32, help='Number of layers.') + self._parser.add_argument('--hidden_size', type=int, required=False, default=4096, help='Hidden size.') + self._parser.add_argument( + '--num_attn_heads', type=int, required=False, default=32, help='Number of attention heads.' + ) + self._parser.add_argument('--micro_batch_size', type=int, required=False, default=2, help='micro batch size.') + self._parser.add_argument('--lr', type=float, required=False, default=1.2e-4, help='Learning rate.') + self._parser.add_argument('--min_lr', type=float, required=False, default=1.0e-6, help='Minimum learning rate.') + self._parser.add_argument('--init_std', type=float, required=False, default=0.009, help='Init std.') + self._parser.add_argument('--seq_len', type=int, required=False, default=2048, help='Sequence length.') + self._parser.add_argument( + '--tensor_model_parallel_size', type=int, required=False, default=1, help='Tensor model parallel size.' + ) + self._parser.add_argument( + '--pipeline_model_parallel_size', type=int, required=False, default=1, help='Pipeline model parallel size.' + ) + self._parser.add_argument( + '--num_gpus', type=int, required=False, default=8, help='Number of GPUs per node to run the benchmark.' + ) + self._parser.add_argument( + '--num_nodes', type=int, required=False, default=1, help='Number of nodes to run the benchmark.' + ) + self._parser.add_argument('--sequence_parallel', action='store_true', help='Enable Sequence parallel.') + self._parser.add_argument( + '--no_async_tensor_model_parallel_allreduce', + action='store_true', + help='No async tensor model parallel allreduce.' + ) + self._parser.add_argument( + '--use_rotary_position_embeddings', action='store_true', help='Use rotary position embeddings.' 
+ ) + self._parser.add_argument( + '--no_gradient_accumulation_fusion', action='store_true', help='No gradient accumulation fusion.' + ) + self._parser.add_argument('--use_flash_attn', action='store_true', help='Use flash attention.') + self._parser.add_argument('--no_masked_softmax_fusion', action='store_true', help='No masked softmax fusion.') + self._parser.add_argument('--no_bias_gelu_fusion', action='store_true', help='No bias gelu fusion.') + self._parser.add_argument('--no_bias_dropout_fusion', action='store_true', help='No bias dropout fusion.') + self._parser.add_argument( + '--train_tokens', type=int, required=False, default=300000000000, help='Train tokens.' + ) + # lr configs + # Parallelism configs + self._parser.add_argument('--zero_stage', type=int, default=1, help='Zero stage.') + # Misc configs + self._parser.add_argument('--log-interval', type=int, required=False, default=1, help='Log interval.') + self._parser.add_argument('--eval_iters', type=int, default=0, help='Eval iters.') + self._parser.add_argument('--eval_interval', type=int, default=10, help='Eval interval.') + self._parser.add_argument('--num_save', type=int, default=10000, help='Num save.') + self._parser.add_argument('--save_interval', type=int, default=10000, help='Save interval.') + # Output and data configs + self._parser.add_argument('--seed', type=int, default=1234, help='Seed.') + self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.') + self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.') + self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.') + self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') + self._parser.add_argument( + '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' 
+ ) + self._parser.add_argument('--data_impl', type=str, default='mmap', help='Data impl.') + self._parser.add_argument('--data_prefix', type=str, default='dataset_text_document', help='Data prefix.') + self._parser.add_argument('--deepspeed', action='store_true', help='Use deepspeed.') + self._parser.add_argument('--extra', type=str, default=None, help='Extra options for Megatron.') + + def _preprocess(self): + if not super()._preprocess(): + return False + + if not os.path.exists(self._args.code_base) or \ + not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')): + logger.error('Code base is not valid.') + self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) + return False + + data_parallel_size = self._args.num_gpus * self._num_nodes \ + // self._args.pipeline_model_parallel_size // self._args.tensor_model_parallel_size + if self._args.micro_batch_size < 1 or \ + self._args.micro_batch_size > (self._args.batch_size // data_parallel_size): + logger.error('Micro Batch size * data parallel size is larger than global batch size.') + self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) + return False + + for precision in self._args.precision: + if precision not in self._supported_precision: + logger.error('Precision %s is not supported.' 
% precision) + self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) + return False + + if not os.path.exists(self._args.data_home): + os.makedirs(self._args.data_home) + + return True + + def _parse_log(self, output): + """Parse log output and get the performance.""" + tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)') + elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)') + mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B') + max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B') + lines = output.splitlines() + tflops = [] + mem_allocated = [] + max_mem_allocated = [] + iteration_times = [] + for line in lines: + if 'TFLOPs' in line: + tflops_matches = tflops_pattern.search(line) + elapsed_time_match = elapsed_time_pattern.search(line) + if tflops_matches: + tflops_values = float(tflops_matches.group(1)) + tflops.append(tflops_values) + if elapsed_time_match: + elapsed_time_value = float(elapsed_time_match.group(1)) + iteration_times.append(elapsed_time_value) + + if 'MaxMemAllocated' in line: + mem_allocated_match = mem_allocated_pattern.search(line) + max_mem_allocated_match = max_mem_allocated_pattern.search(line) + if mem_allocated_match: + mem_allocated_value = float(mem_allocated_match.group(1)) + mem_allocated.append(mem_allocated_value) + + if max_mem_allocated_match: + max_mem_allocated_value = float(max_mem_allocated_match.group(1)) + max_mem_allocated.append(max_mem_allocated_value) + + return iteration_times, tflops, mem_allocated, max_mem_allocated + + def __prepare_deespeed_config(self, precision_megatron): + """Prepare deepspeed configs.""" + self._config_json_path = os.path.join(self._args.data_home, 'ds_config_gpt.json') + # Load deepspeed config template json file + precision_template = { + 'enabled': True, + 'loss_scale': 0, + 'loss_scale_window': 500, + 'hysteresis': 2, + 'min_loss_scale': 1, + 'initial_scale_power': 11 + } + + ds_config_template = { + 
'train_batch_size': self._args.batch_size, + 'train_micro_batch_size_per_gpu': self._args.micro_batch_size, + 'steps_per_print': self._args.log_interval, + 'zero_optimization': { + 'stage': self._args.zero_stage + }, + 'gradient_clipping': 1.0, + 'prescale_gradients': self._args.prescale_grad, + } + + if len(precision_megatron) > 0: + ds_config_template[precision_megatron] = precision_template + + # Write to config json file + with open(self._config_json_path, 'w') as file: + json.dump(ds_config_template, file, indent=4) + + deepspeed_options = f'\ + --deepspeed \ + --deepspeed_config {self._config_json_path} \ + --zero-stage {self._args.zero_stage} \ + --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}' + + if self._args.pipeline_model_parallel_size <= 1: + deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel' + return deepspeed_options + + def _megatron_command(self, precision): # noqa: C901 + """Generate megatron command.""" + if precision == Precision.FLOAT32: + precision_megatron = '' + elif precision == Precision.FLOAT16: + precision_megatron = '--fp16' + elif precision == Precision.BFLOAT16: + precision_megatron = '--bf16' + + megatron_options = f'\ + --override-opt_param-scheduler \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --tensor-model-parallel-size {self._args.tensor_model_parallel_size} \ + --init-method-std {self._args.init_std} \ + --lr-decay-samples 43945312 \ + --lr-warmup-samples {self._args.num_warmup * self._args.batch_size} \ + --lr-decay-style cosine \ + --micro-batch-size {self._args.micro_batch_size} \ + --global-batch-size {self._args.batch_size} \ + --num-layers {self._args.num_layers} \ + --hidden-size {self._args.hidden_size} \ + --num-attention-heads {self._args.num_attn_heads} \ + --seq-length {self._args.seq_len} \ + --max-position-embeddings {self._args.seq_len} \ + --train-tokens {self._args.train_tokens} \ + --train-samples {self._args.num_steps * self._args.batch_size} \ + --lr {self._args.lr} 
\ + --min-lr {self._args.min_lr} \ + --split 949,50,1 \ + --log-interval {self._args.log_interval} \ + --eval-interval {self._args.eval_interval} \ + --eval-iters {self._args.eval_iters} \ + --save-interval {self._args.save_interval} \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --hysteresis 2 \ + --num-workers {self._args.num_workers} \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --optimizer adam \ + --use-distributed-optimizer \ + {precision_megatron} \ + --seed {self._args.seed}' + + if self._args.sequence_parallel: + megatron_options = f'{megatron_options} --sequence-parallel' + if self._args.no_async_tensor_model_parallel_allreduce: + megatron_options = f'{megatron_options} --no-async-tensor-model-parallel-allreduce' + if self._args.use_rotary_position_embeddings: + megatron_options = f'{megatron_options} --use-rotary-position-embeddings' + if self._args.no_gradient_accumulation_fusion: + megatron_options = f'{megatron_options} --no-gradient-accumulation-fusion' + if self._args.use_flash_attn: + megatron_options = f'{megatron_options} --use-flash-attn' + if self._args.no_masked_softmax_fusion: + megatron_options = f'{megatron_options} --no-masked-softmax-fusion' + if self._args.no_bias_gelu_fusion: + megatron_options = f'{megatron_options} --no-bias-gelu-fusion' + if self._args.no_bias_dropout_fusion: + megatron_options = f'{megatron_options} --no-bias-dropout-fusion' + if self._args.extra: + megatron_options = f'{megatron_options} {self._args.extra}' + + command = '' + script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py') + if self._args.deepspeed: + deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) + if self._num_nodes > 1: + command = f'torchrun {self._distributed_args} ' + \ + f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}' + else: + command = f'deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}' + + else: + command = f'torchrun 
{self._distributed_args} {script_path} {megatron_options} {self._data_options}' + + return command + + def _train_step(self, precision): # noqa: E501 + """Train the model and get the performance.""" + command = self._megatron_command(precision) + local_rank = os.environ.pop('OMPI_COMM_WORLD_LOCAL_RANK', None) + logger.info('Running command: {}.'.format(command)) + output = run_command(command, flush_output=True) + os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] = local_rank + + iteration_times = [] + info = {} + # last rank will print the result, first rank will print the memory usage + if self._num_nodes == 1 or \ + int(os.environ['OMPI_COMM_WORLD_RANK']) == int(os.environ['OMPI_COMM_WORLD_SIZE']) - 1 \ + or int(os.environ['OMPI_COMM_WORLD_RANK']) == 0: + iteration_times, tflops, mem_allocated, max_mem_allocated = self._parse_log(output.stdout) + if len(tflops) > 0: + info['tflops'] = tflops + if len(mem_allocated) > 0: + info['mem_allocated'] = mem_allocated + if len(max_mem_allocated) > 0: + info['max_mem_allocated'] = max_mem_allocated + if not iteration_times: + iteration_times = [-1 for i in range(self._args.num_steps)] + + return iteration_times, info + + def _sync_result(self, data): + """Sync the result of model benchmarking. + + Args: + data (list): the data to be reduced. 
+ """ + from mpi4py import MPI + comm = MPI.COMM_WORLD + data = np.array(data, dtype=np.float64) + # Reduce the data to a single value on rank 0 + result = np.zeros_like(data) + comm.Allreduce([data, MPI.DOUBLE], [result, MPI.DOUBLE], op=MPI.MAX) + return result.tolist() + + def _process_info(self, model_action, precision, info): + """Process the result of model benchmarking.""" + precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'bfloat16': 'bf16'} + if precision.value in precision_metric.keys(): + precision = precision_metric[precision.value] + for key, values in info.items(): + metric = '{}_{}_{}'.format(precision, model_action, key) + self._result.add_raw_data(metric, values, self._args.log_raw_data) + self._result.add_result(metric, statistics.mean(values)) + logger.info( + 'Average {} - round: {}, model: {}, precision: {}, value: {:.6f}.'.format( + key, self._curr_run_index, self._name, precision, statistics.mean(values) + ) + ) + + def _judge_gpu_availability(self): + """Judge GPUs' availability according to arguments and running environment.""" + self._gpu_available = not self._args.no_gpu and torch.cuda.is_available() + + def _init_distributed_setting(self): + """Initialize the distributed library and bind the worker to GPU. + + Return: + True if distributed library is initialized successfully. 
+ """ + if not os.getenv('OMPI_COMM_WORLD_SIZE'): + logger.error('MPI is not enabled.') + + return False + self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) + if self._num_nodes > 1: + if not self._args.hostfile: + sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile') + if os.path.exists(sb_hostfile): + hosts = open(sb_hostfile).read().split('\n') + hosts = [f'{host} slots={self._args.num_gpus}' for host in hosts if host != ''] + self._args.hostfile = os.path.join(self._args.data_home, 'hostfile') + with open(self._args.hostfile, 'w') as file: + file.write('\n'.join(hosts)) + if not os.path.exists(self._args.hostfile): + logger.error('Hostfile not found.') + return False + hosts = open(self._args.hostfile, 'r').readlines() + if self._num_nodes != len(hosts): + logger.error('MPI init failed since hostfile not match the MPI setting.') + return False + + addr = os.getenv('MASTER_ADDR', hosts[0].split()[0]) + port = os.getenv('MASTER_PORT', '29500') + node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE']) + self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \ + f'--node_rank {node_rank} --master_addr {addr} --master_port {port}' + return True + + def _generate_dataset(self): + """Generate dataset for benchmarking. + + Return: + True if dataset is created successfully. 
+ """ + self._vocab_path = str(Path(self._args.data_home) / 'gpt2-vocab.json') + download_file(self._args.vocab_url, self._vocab_path) + self._merges_path = str(Path(self._args.data_home) / 'gpt2-merges.txt') + download_file(self._args.merges_url, self._merges_path) + + if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ + or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): + if self._args.dataset_url: + self._raw_data_path = str(Path(self._args.data_home) / 'data.json') + download_file(self._args.dataset_url, self._raw_data_path) + command = ( + 'python3 ' + f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' + f'--input {self._raw_data_path} ' + f'--tokenizer-type {self._args.tokenizer_type} ' + f'--output-prefix {os.path.join(self._args.data_home, "dataset")} ' + f'--workers {str(self._args.num_workers)} ' + f'--vocab-file {self._vocab_path} ' + f'--merge-file {self._merges_path}' + ) + + # split documents + run_command(command, flush_output=True) + # binarize dataset + run_command(command, flush_output=True) + if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ + or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): + logger.error('Dataset failed to generate.') + self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) + return False + else: + logger.error('No dataset or dataset url provided.') + self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) + return False + + self._data_path = os.path.join(self._args.data_home, f'{self._args.data_prefix}') + self._data_options = f'\ + --vocab-file {self._vocab_path} \ + --merge-file {self._merges_path} \ + --data-path {self._data_path} \ + --data-impl {self._args.data_impl}' + + logger.info('Dataset preparation successfully.') + return True + + def _set_force_fp32(self): + """Set force FP32.""" + pass + + def 
_init_dataloader(self): + """Initialize the dataloader. + + Return: + True if dataloader is created successfully. + """ + return True + + def _create_optimizer(self): + """Create the optimzier instance used for training and wrap with distributed library if need. + + Return: + True if optimizer instance is created successfully. + """ + return True + + def _create_model(self, precision): + """Construct the model for benchmarking. + + Args: + precision (Precision): precision of model and input data, such as float32, float16. + """ + return True + + def _inference_step(self, precision): + """Define the inference process. + + Args: + precision (Precision): precision of model and input data, + such as float32, float16. + + Return: + The latency list of every inference operation. + """ + pass + + def _cal_params_count(self): + """Calculate the parameters scale of the model. + + Return: + The count of trainable parameters. + """ + pass + + +# Register GPT3 benchmark. +BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.CUDA) +BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.ROCM) diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py index 6238c2b0e..d1da27b43 100644 --- a/superbench/benchmarks/model_benchmarks/model_base.py +++ b/superbench/benchmarks/model_benchmarks/model_base.py @@ -7,6 +7,7 @@ import time import statistics from abc import abstractmethod +from typing import Union from superbench.common.utils import logger, stdout_logger from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode @@ -263,6 +264,10 @@ def __train(self, precision): # The unit of step time should be millisecond. 
step_times = self._train_step(precision) + if isinstance(step_times, tuple): + step_times = step_times[0] + info = step_times[1] + self._process_info(ModelAction.TRAIN, precision, info) step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times) if not step_times: self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT) @@ -302,7 +307,7 @@ def __inference(self, precision): return True @abstractmethod - def _train_step(self, precision): + def _train_step(self, precision) -> Union[list, tuple]: """Define the training process. Args: @@ -418,6 +423,7 @@ def __process_model_result(self, model_action, precision, step_times): precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'float64': 'fp64', 'bfloat16': 'bf16'} if precision.value in precision_metric.keys(): precision = precision_metric[precision.value] + metric_s = '{}_{}_step_time'.format(precision, model_action) metric_t = '{}_{}_throughput'.format(precision, model_action) # The unit of step time is millisecond, use it to calculate the throughput with the unit samples/sec. @@ -428,7 +434,7 @@ def __process_model_result(self, model_action, precision, step_times): if model_action == ModelAction.TRAIN: step_times = self._sync_result(step_times) - if not step_times: + if not step_times or statistics.mean(step_times) < 0: return None if self._local_rank is None or self._global_rank == 0: self._result.add_result(metric_s, statistics.mean(step_times)) @@ -468,3 +474,13 @@ def _log_step_time(self, curr_step, precision, duration): step_time = statistics.mean(duration) if len(duration) < self._args.log_n_steps \ else statistics.mean(duration[-self._args.log_n_steps:]) stdout_logger.log(f'{self._name} - {precision.value}: step {curr_step}, step time {step_time}\n') + + def _process_info(self, model_action, precision, info): + """Process other info. + + Args: + model_action (ModelAction): train or inference. + precision (Precision): precision of model. + info (dict): other info. 
+ """ + pass diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml index 1a6af7dc5..7bade5d37 100644 --- a/superbench/config/default.yaml +++ b/superbench/config/default.yaml @@ -207,6 +207,23 @@ superbench: seq_length: 224 batch_size: 1 precision: int8 + megatron-gpt: + modes: + - name: mpi + proc_num: 1 + node_num: all + parameters: + code_base: /opt/superbench/third_party/Megatron/Megatron-DeepSpeed/ + dataset_url: https://huggingface.co/datasets/suolyer/pile_bookcorpus2/raw/main/test.json + batch_size: 2048 + num_warmup: 0 + num_steps: 10 + precision: + - float16 + - bfloat16 + deepspeed: yes + sequence_parallel: yes + use_rotary_position_embeddings: yes gpt_models: <<: *default_pytorch_mode models: diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py new file mode 100644 index 000000000..7a8be1aaf --- /dev/null +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -0,0 +1,357 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Tests for BERT model benchmarks.""" + +import os +from pathlib import Path +import statistics +from unittest import mock +import unittest +from superbench.benchmarks.context import ModelAction, Precision + +from tests.helper import decorator +from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode +from tests.helper.testcase import BenchmarkTestCase + + +class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase): + """Tests for IBBenchmark benchmark.""" + @classmethod + def setUpClass(cls): + """Hook method for setting up class fixture before running tests in the class.""" + super().setUpClass() + cls.benchmark_name = 'megatron-gpt' + cls.createMockEnvs(cls) + cls.hostfile_path = os.path.join(cls._tmp_dir, 'hostfile') + + @classmethod + def tearDownClass(cls): + """Hook method for deconstructing the class fixture after running all tests in the class.""" + for p in [ + Path(cls._tmp_dir) / 'pretrain_gpt.py', + Path(cls._tmp_dir) / 'customdataset_text_document.bin', + Path(cls._tmp_dir) / 'customdataset_text_document.idx', + Path(cls._tmp_dir) / 'hostfile' + ]: + if p.is_file(): + p.unlink() + super().tearDownClass() + + @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') + def test_megatron_gpt_preprocess(self, mock_generate_dataset): + """Test megatron-gpt benchmark.""" + # Check registry. + (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA) + assert (benchmark_cls) + benchmark = benchmark_cls( + self.benchmark_name, + parameters=f'--hostfile {self.hostfile_path} --batch_size 2048', + ) + + # Check init distribued setting. 
+ os.environ['OMPI_COMM_WORLD_SIZE'] = '2' + os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_RANK'] = '0' + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + with open(self.hostfile_path, 'w') as f: + f.write('host1\n') + f.write('host2\n') + f.write('host3\n') + mock_generate_dataset.return_value = True + ret = benchmark._preprocess() + assert (ret is False) + assert (benchmark.return_code == ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE) + + benchmark = benchmark_cls( + self.benchmark_name, + parameters='--hostfile xxx --batch_size 2048', + ) + mock_generate_dataset.return_value = True + ret = benchmark._preprocess() + assert (ret is False) + assert (benchmark.return_code == ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE) + + os.environ['OMPI_COMM_WORLD_SIZE'] = '3' + os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1' + benchmark = benchmark_cls( + self.benchmark_name, + parameters=f'--hostfile {self.hostfile_path} --batch_size 2048', + ) + mock_generate_dataset.return_value = True + benchmark._preprocess() + self.assertEqual(benchmark._num_nodes, 3) + self.assertEqual( + benchmark._distributed_args, + '--nproc_per_node {0} --nnodes {1} --node_rank {2} --master_addr {3} --master_port {4}'.format( + benchmark._args.num_gpus, benchmark._num_nodes, 0, 'localhost', '12345' + ) + ) + + # Check preprocessing. 
+ # Negative cases + # no code_base + benchmark = benchmark_cls( + self.benchmark_name, + parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --batch_size 2048', + ) + mock_generate_dataset.return_value = True + ret = benchmark._preprocess() + assert (ret is False) + self.createMockFiles(['pretrain_gpt.py']) + # invalid micro batch size + benchmark = benchmark_cls( + self.benchmark_name, + parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --micro_batch_size -1', + ) + mock_generate_dataset.return_value = True + ret = benchmark._preprocess() + assert (ret is False) + benchmark = benchmark_cls( + self.benchmark_name, + parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --micro_batch_size 4096', + ) + mock_generate_dataset.return_value = True + ret = benchmark._preprocess() + assert (ret is False) + # invalid precision + benchmark = benchmark_cls( + self.benchmark_name, + parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ + --batch_size 2048 --precision int8', + ) + mock_generate_dataset.return_value = True + ret = benchmark._preprocess() + assert (ret is False) + # Positive cases + benchmark = benchmark_cls( + self.benchmark_name, + parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --batch_size 2048', + ) + mock_generate_dataset.return_value = True + ret = benchmark._preprocess() + assert (ret is True) + + def test_megatron_gpt_dataset(self): + """Test dataset generation.""" + (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA) + assert (benchmark_cls) + os.environ['OMPI_COMM_WORLD_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_RANK'] = '0' + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + # use existing dataset + self.createMockFiles(['customdataset_text_document.bin', 'customdataset_text_document.idx']) + benchmark = 
benchmark_cls( + self.benchmark_name, + parameters=f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} \ + --batch_size 2048 --data_prefix customdataset_text_document', + ) + ret = benchmark._preprocess() + ret = benchmark._generate_dataset() + assert (ret is True) + + @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') + def test_megatron_gpt_command(self, mock_generate_dataset): + """Test command generation.""" + (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA) + assert (benchmark_cls) + os.environ['OMPI_COMM_WORLD_SIZE'] = '2' + os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_RANK'] = '0' + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + with open(self.hostfile_path, 'w') as f: + f.write('host1\n') + f.write('host2\n') + # use url to process dataset + benchmark = benchmark_cls( + self.benchmark_name, + parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ + --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document', + ) + mock_generate_dataset.return_value = True + benchmark._preprocess() + benchmark._data_options = f'\ + --vocab-file {self._tmp_dir}/gpt2-vocab.json \ + --merge-file {self._tmp_dir}/gpt2-merges.txt \ + --data-path {self._tmp_dir}/dataset_text_document \ + --data-impl mmap' + + script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py') + expected_command = 'torchrun {distributed_args} {script_path} \ + --override-opt_param-scheduler \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --tensor-model-parallel-size 1 \ + --init-method-std 0.009 \ + --lr-decay-samples 43945312 \ + --lr-warmup-samples 0 \ + --lr-decay-style cosine \ + --micro-batch-size 2 \ + --global-batch-size 2048 \ + --num-layers 32 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --train-tokens 300000000000 \ 
+ --train-samples 20480 \ + --lr 0.00012 \ + --min-lr 1e-06 \ + --split 949,50,1 \ + --log-interval 1 \ + --eval-interval 10 \ + --eval-iters 0 \ + --save-interval 10000 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --hysteresis 2 \ + --num-workers 8 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --optimizer adam \ + --use-distributed-optimizer \ + {precision} \ + --seed 1234 {data_options}' + + precision = Precision.FLOAT32 + command = benchmark._megatron_command(precision) + self.assertEqual( + command, + expected_command.format( + precision='', + data_options=benchmark._data_options, + distributed_args=benchmark._distributed_args, + script_path=script_path + ) + ) + precision = Precision.FLOAT16 + command = benchmark._megatron_command(precision) + self.assertEqual( + command, + expected_command.format( + precision='--fp16', + data_options=benchmark._data_options, + distributed_args=benchmark._distributed_args, + script_path=script_path + ) + ) + precision = Precision.BFLOAT16 + command = benchmark._megatron_command(precision) + self.assertEqual( + command, + expected_command.format( + precision='--bf16', + data_options=benchmark._data_options, + distributed_args=benchmark._distributed_args, + script_path=script_path + ) + ) + + os.environ['OMPI_COMM_WORLD_SIZE'] = '1' + benchmark = benchmark_cls( + self.benchmark_name, + parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ + --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --deepspeed', + ) + mock_generate_dataset.return_value = True + benchmark._preprocess() + benchmark._data_options = f'\ + --vocab-file {self._tmp_dir}/gpt2-vocab.json \ + --merge-file {self._tmp_dir}/gpt2-merges.txt \ + --data-path {self._tmp_dir}/dataset_text_document \ + --data-impl mmap' + + command = benchmark._megatron_command(Precision.BFLOAT16) + expected_command = 'deepspeed {script_path} \ + --override-opt_param-scheduler \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + 
--tensor-model-parallel-size 1 \ + --init-method-std 0.009 \ + --lr-decay-samples 43945312 \ + --lr-warmup-samples 0 \ + --lr-decay-style cosine \ + --micro-batch-size 2 \ + --global-batch-size 2048 \ + --num-layers 32 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --train-tokens 300000000000 \ + --train-samples 20480 \ + --lr 0.00012 \ + --min-lr 1e-06 \ + --split 949,50,1 \ + --log-interval 1 \ + --eval-interval 10 \ + --eval-iters 0 \ + --save-interval 10000 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --hysteresis 2 \ + --num-workers 8 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --optimizer adam \ + --use-distributed-optimizer \ + {precision} \ + --seed 1234 {data_options} {deepseed_options}' + + expect_ds_options = f'\ + --deepspeed \ + --deepspeed_config {benchmark._config_json_path} \ + --zero-stage 1 \ + --pipeline-model-parallel-size 1 --no-pipeline-parallel' + + self.assertEqual( + command, + expected_command.format( + precision='--bf16', + data_options=benchmark._data_options, + script_path=script_path, + deepseed_options=expect_ds_options + ) + ) + + @decorator.load_data('tests/data/megatron_deepspeed.log') + @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') + def test_megatron_parse_log(self, raw_output, mock_generate_dataset): + """Test parse log function.""" + (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA) + assert (benchmark_cls) + os.environ['OMPI_COMM_WORLD_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_RANK'] = '0' + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + + # use url to process dataset + benchmark = benchmark_cls( + self.benchmark_name, + parameters=f'--code_base {self._tmp_dir} --num_warmup 0 --num_steps 10 --batch_size 2048', + ) + mock_generate_dataset.return_value = True + 
benchmark._preprocess() + benchmark._data_options = f'\ + --vocab-file {self._tmp_dir}/gpt2-vocab.json \ + --merge-file {self._tmp_dir}/gpt2-merges.txt \ + --data-path {self._tmp_dir}/dataset_text_document \ + --data-impl mmap' + + iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output) + assert (statistics.mean(iteration_times) == 75239.24) + assert (statistics.mean(tflops) == 149.136) + assert (statistics.mean(mem_allocated) == 17.54) + assert (statistics.mean(max_mem_allocated) == 66.97) + + info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated} + benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info) + assert (benchmark.result is not None) + assert (benchmark.result['fp16_train_tflops'][0] == 149.136) + assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54) + assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97) diff --git a/tests/data/megatron_deepspeed.log b/tests/data/megatron_deepspeed.log new file mode 100644 index 000000000..61ba17a70 --- /dev/null +++ b/tests/data/megatron_deepspeed.log @@ -0,0 +1,1461 @@ +[2023-11-29 08:50:44,619] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-11-29 08:50:46,231] [INFO] [runner.py:463:main] Using IP address of 10.218.187.178 for node vm-07-05 +[2023-11-29 08:50:46,232] [INFO] [multinode_runner.py:72:get_cmd] Running on the following workers: vm-07-05,vm-07-14 +[2023-11-29 08:50:46,232] [INFO] [runner.py:570:main] cmd = pdsh -S -f 1024 -w vm-07-05,vm-07-14 export PYTHONPATH=/root/Megatron-DeepSpeed/examples_deepspeed/rebase::/root/Megatron-DeepSpeed; export UCX_HOME=/opt/ucx; cd /root/Megatron-DeepSpeed/examples_deepspeed/rebase; /opt/conda/envs/py_3.9/bin/python -u -m deepspeed.launcher.launch --world_info=eyJ2bS0wNy0wNSI6IFswLCAxLCAyLCAzLCA0LCA1LCA2LCA3XSwgInZtLTA3LTE0IjogWzAsIDEsIDIsIDMsIDQsIDUsIDYsIDddfQ== --node_rank=%n 
--master_addr=10.218.187.178 --master_port=29500 /root/Megatron-DeepSpeed/examples_deepspeed/rebase/../../pretrain_gpt.py --override-opt_param-scheduler --adam-beta1 '0.9' --adam-beta2 '0.95' --tensor-model-parallel-size '1' --init-method-std '0.009' --lr-decay-samples '43945312' --lr-warmup-samples '2048000' --lr-decay-style 'cosine' --micro-batch-size '2' --exit-duration-in-mins '30000000' --global-batch-size '2048' --num-layers '32' --hidden-size '4096' --num-attention-heads '32' --seq-length '2048' --max-position-embeddings '2048' --train-tokens '300000000000' --train-samples '10240' --lr '1.2e-4' --min-lr '1.0e-6' --split '949,50,1' --log-interval '1' --eval-interval '500' --eval-iters '10' --save-interval '10000' --weight-decay '0.1' --clip-grad '1.0' --hysteresis '2' --num-workers '2' --attention-dropout '0.0' --hidden-dropout '0.0' --optimizer 'adam' --use-distributed-optimizer --sequence-parallel --fp16 --seed '1234' --load './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase' --save './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase' --no-async-tensor-model-parallel-allreduce --use-rotary-position-embeddings --no-gradient-accumulation-fusion --vocab-file 'gpt2-vocab.json' --merge-file 'gpt2-merges.txt' --data-path '/root//dataset_text_sentence' --data-impl 'mmap' --deepspeed --deepspeed_config 'ds_config_gbs2048_mbs2_log1_zero1.json' --zero-stage '1' --pipeline-model-parallel-size '1' --no-pipeline-parallel +vm-07-05: [2023-11-29 08:50:48,288] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-14: [2023-11-29 08:50:48,369] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-05: [2023-11-29 08:50:49,536] [INFO] [launch.py:145:main] WORLD INFO DICT: {'vm-07-05': [0, 1, 2, 3, 4, 5, 6, 7], 'vm-07-14': [0, 1, 2, 3, 4, 5, 6, 7]} +vm-07-05: [2023-11-29 08:50:49,536] [INFO] 
[launch.py:151:main] nnodes=2, num_local_procs=8, node_rank=0 +vm-07-05: [2023-11-29 08:50:49,536] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'vm-07-05': [0, 1, 2, 3, 4, 5, 6, 7], 'vm-07-14': [8, 9, 10, 11, 12, 13, 14, 15]}) +vm-07-05: [2023-11-29 08:50:49,536] [INFO] [launch.py:163:main] dist_world_size=16 +vm-07-05: [2023-11-29 08:50:49,536] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +vm-07-14: [2023-11-29 08:50:49,657] [INFO] [launch.py:145:main] WORLD INFO DICT: {'vm-07-05': [0, 1, 2, 3, 4, 5, 6, 7], 'vm-07-14': [0, 1, 2, 3, 4, 5, 6, 7]} +vm-07-14: [2023-11-29 08:50:49,657] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=8, node_rank=1 +vm-07-14: [2023-11-29 08:50:49,657] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'vm-07-05': [0, 1, 2, 3, 4, 5, 6, 7], 'vm-07-14': [8, 9, 10, 11, 12, 13, 14, 15]}) +vm-07-14: [2023-11-29 08:50:49,657] [INFO] [launch.py:163:main] dist_world_size=16 +vm-07-14: [2023-11-29 08:50:49,657] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +vm-07-05: [2023-11-29 08:50:51,594] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-05: [2023-11-29 08:50:51,640] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-05: [2023-11-29 08:50:51,644] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-05: [2023-11-29 08:50:51,644] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-14: [2023-11-29 08:50:51,660] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-05: [2023-11-29 08:50:51,675] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-05: [2023-11-29 08:50:51,684] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) 
+vm-07-05: [2023-11-29 08:50:51,705] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-05: [2023-11-29 08:50:51,713] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-14: [2023-11-29 08:50:51,724] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-14: [2023-11-29 08:50:51,777] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-14: [2023-11-29 08:50:51,780] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-14: [2023-11-29 08:50:51,784] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-14: [2023-11-29 08:50:51,820] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-14: [2023-11-29 08:50:51,820] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-14: [2023-11-29 08:50:51,820] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +vm-07-05: Deterministic: False +vm-07-05: Performance Mode: True +vm-07-05: Using QLoop: True +vm-07-05: Deterministic: False +vm-07-05: Performance Mode: True +vm-07-05: Using QLoop: True +vm-07-05: Deterministic: False +vm-07-05: Performance Mode: True +vm-07-05: Using QLoop: True +vm-07-14: Deterministic: False +vm-07-14: Performance Mode: True +vm-07-14: Using QLoop: True +vm-07-05: Deterministic: False +vm-07-05: Performance Mode: True +vm-07-05: Using QLoop: True +vm-07-05: Deterministic: False +vm-07-05: Performance Mode: True +vm-07-05: Using QLoop: True +vm-07-05: Deterministic: False +vm-07-05: Performance Mode: True +vm-07-05: Using QLoop: True +vm-07-14: Deterministic: False +vm-07-14: Performance Mode: True +vm-07-14: Using QLoop: True +vm-07-05: Deterministic: False +vm-07-05: Performance Mode: 
True +vm-07-05: Using QLoop: True +vm-07-05: Deterministic: False +vm-07-05: Performance Mode: True +vm-07-05: Using QLoop: True +vm-07-14: Deterministic: False +vm-07-14: Performance Mode: True +vm-07-14: Using QLoop: True +vm-07-14: Deterministic: False +vm-07-14: Performance Mode: True +vm-07-14: Using QLoop: True +vm-07-14: Deterministic: False +vm-07-14: Performance Mode: True +vm-07-14: Using QLoop: True +vm-07-14: Deterministic: False +vm-07-14: Performance Mode: True +vm-07-14: Using QLoop: True +vm-07-14: Deterministic: False +vm-07-14: Performance Mode: True +vm-07-14: Using QLoop: True +vm-07-14: Deterministic: False +vm-07-14: Performance Mode: True +vm-07-14: Using QLoop: True +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed C++/CUDA extension op report +vm-07-05: -------------------------------------------------- +vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-05: runtime if needed. Op compatibility means that your system +vm-07-05: meet the required dependencies to JIT install the op. +vm-07-05: -------------------------------------------------- +vm-07-05: JIT compiled ops requires ninja +vm-07-05: ninja .................. [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: op name ................ installed .. compatible +vm-07-05: -------------------------------------------------- +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed C++/CUDA extension op report +vm-07-05: -------------------------------------------------- +vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-05: runtime if needed. Op compatibility means that your system +vm-07-05: meet the required dependencies to JIT install the op. +vm-07-05: -------------------------------------------------- +vm-07-05: JIT compiled ops requires ninja +vm-07-05: ninja .................. 
[OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: op name ................ installed .. compatible +vm-07-05: -------------------------------------------------- +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed C++/CUDA extension op report +vm-07-05: -------------------------------------------------- +vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-05: runtime if needed. Op compatibility means that your system +vm-07-05: meet the required dependencies to JIT install the op. +vm-07-05: -------------------------------------------------- +vm-07-05: JIT compiled ops requires ninja +vm-07-05: ninja .................. [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: op name ................ installed .. compatible +vm-07-05: -------------------------------------------------- +vm-07-05: async_io ............... [NO] ....... [OKAY] +vm-07-05: fused_adam ............. [NO] ....... [OKAY] +vm-07-05: cpu_adam ............... [NO] ....... [OKAY] +vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-05: cpu_lion ............... [NO] ....... [OKAY] +vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-05: evoformer_attn ......... [NO] ....... [NO] +vm-07-05: fused_lamb ............. [NO] ....... [OKAY] +vm-07-05: fused_lion ............. [NO] ....... [OKAY] +vm-07-05: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-05: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-05: quantizer .............. [NO] ....... [OKAY] +vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-05: ragged_ops ............. [NO] ....... [OKAY] +vm-07-05: random_ltd ............. [NO] ....... [OKAY] +vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-05: sparse_attn ............ [NO] ....... [NO] +vm-07-05: spatial_inference ...... [NO] ....... [OKAY] +vm-07-05: transformer ............ 
[NO] ....... [OKAY] +vm-07-05: stochastic_transformer . [NO] ....... [OKAY] +vm-07-05: transformer_inference .. [NO] ....... [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed general environment info: +vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-05: torch version .................... 2.1.0a0+gita09f30a +vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-05: torch cuda version ............... None +vm-07-05: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-05: nvcc version ..................... None +vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-05: shared memory (/dev/shm) size .... 865.10 GB +vm-07-05: async_io ............... [NO] ....... [OKAY] +vm-07-05: fused_adam ............. [NO] ....... [OKAY] +vm-07-05: cpu_adam ............... [NO] ....... [OKAY] +vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-05: cpu_lion ............... [NO] ....... [OKAY] +vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-05: evoformer_attn ......... [NO] ....... [NO] +vm-07-05: fused_lamb ............. [NO] ....... [OKAY] +vm-07-05: fused_lion ............. [NO] ....... [OKAY] +vm-07-05: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-05: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-05: quantizer .............. [NO] ....... [OKAY] +vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-05: ragged_ops ............. [NO] ....... [OKAY] +vm-07-05: random_ltd ............. [NO] ....... [OKAY] +vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-05: sparse_attn ............ [NO] ....... [NO] +vm-07-05: spatial_inference ...... [NO] ....... [OKAY] +vm-07-05: transformer ............ [NO] ....... 
[OKAY] +vm-07-05: stochastic_transformer . [NO] ....... [OKAY] +vm-07-05: transformer_inference .. [NO] ....... [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed general environment info: +vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-05: torch version .................... 2.1.0a0+gita09f30a +vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-05: torch cuda version ............... None +vm-07-05: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-05: nvcc version ..................... None +vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-05: shared memory (/dev/shm) size .... 865.10 GB +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed C++/CUDA extension op report +vm-07-14: -------------------------------------------------- +vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-14: runtime if needed. Op compatibility means that your system +vm-07-14: meet the required dependencies to JIT install the op. +vm-07-14: -------------------------------------------------- +vm-07-14: JIT compiled ops requires ninja +vm-07-14: ninja .................. [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: op name ................ installed .. compatible +vm-07-14: -------------------------------------------------- +vm-07-05: async_io ............... [NO] ....... [OKAY] +vm-07-05: fused_adam ............. [NO] ....... [OKAY] +vm-07-05: cpu_adam ............... [NO] ....... [OKAY] +vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-05: cpu_lion ............... [NO] ....... 
[OKAY] +vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-05: evoformer_attn ......... [NO] ....... [NO] +vm-07-05: fused_lamb ............. [NO] ....... [OKAY] +vm-07-05: fused_lion ............. [NO] ....... [OKAY] +vm-07-05: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-05: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-05: quantizer .............. [NO] ....... [OKAY] +vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-05: ragged_ops ............. [NO] ....... [OKAY] +vm-07-05: random_ltd ............. [NO] ....... [OKAY] +vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-05: sparse_attn ............ [NO] ....... [NO] +vm-07-05: spatial_inference ...... [NO] ....... [OKAY] +vm-07-05: transformer ............ [NO] ....... [OKAY] +vm-07-05: stochastic_transformer . [NO] ....... [OKAY] +vm-07-05: transformer_inference .. [NO] ....... [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed general environment info: +vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-05: torch version .................... 2.1.0a0+gita09f30a +vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-05: torch cuda version ............... None +vm-07-05: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-05: nvcc version ..................... None +vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-05: shared memory (/dev/shm) size .... 
865.10 GB +vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-05: INFO: overriding default arguments for tokenizer_type:None with tokenizer_type:GPT2BPETokenizer +vm-07-05: using world size: 16, data-parallel-size: 16, sequence-parallel size: 1, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +vm-07-05: using torch.float16 for parameters ... +vm-07-05: ------------------------ arguments ------------------------ +vm-07-05: accumulate_allreduce_grads_in_fp32 .............. False +vm-07-05: adam_beta1 ...................................... 0.9 +vm-07-05: adam_beta2 ...................................... 0.95 +vm-07-05: adam_eps ........................................ 1e-08 +vm-07-05: add_bias_linear ................................. True +vm-07-05: add_position_embedding .......................... False +vm-07-05: adlr_autoresume ................................. False +vm-07-05: adlr_autoresume_interval ........................ 1000 +vm-07-05: aml_data_download_path .......................... None +vm-07-05: apply_layernorm_1p .............................. False +vm-07-05: apply_query_key_layer_scaling ................... True +vm-07-05: apply_residual_connection_post_layernorm ........ False +vm-07-05: async_tensor_model_parallel_allreduce ........... False +vm-07-05: attention_dropout ............................... 0.0 +vm-07-05: attention_softmax_in_fp32 ....................... False +vm-07-05: barrier_with_L1_time ............................ True +vm-07-05: bert_binary_head ................................ True +vm-07-05: bert_embedder_type .............................. megatron +vm-07-05: bert_load ....................................... None +vm-07-05: bf16 ............................................ False +vm-07-05: bias_dropout_fusion ............................. True +vm-07-05: bias_gelu_fusion ................................ 
True +vm-07-05: biencoder_projection_dim ........................ 0 +vm-07-05: biencoder_shared_query_context_model ............ False +vm-07-05: block_data_path ................................. None +vm-07-05: checkpoint_activations .......................... False +vm-07-05: checkpoint_in_cpu ............................... False +vm-07-05: checkpoint_num_layers ........................... 1 +vm-07-05: classes_fraction ................................ 1.0 +vm-07-05: clip_grad ....................................... 1.0 +vm-07-05: compression_training ............................ False +vm-07-05: consumed_train_samples .......................... 0 +vm-07-05: consumed_train_tokens ........................... 0 +vm-07-05: consumed_valid_samples .......................... 0 +vm-07-05: contagious_checkpointing ........................ False +vm-07-05: cpu_optimizer ................................... False +vm-07-05: cpu_torch_adam .................................. False +vm-07-05: create_moe_param_group .......................... False +vm-07-05: curriculum_learning_legacy ...................... False +vm-07-05: data_cache_path ................................. None +vm-07-05: data_efficiency_curriculum_learning ............. False +vm-07-05: data_impl ....................................... mmap +vm-07-05: data_parallel_random_init ....................... False +vm-07-05: data_parallel_size .............................. 16 +vm-07-05: data_path ....................................... ['/root//dataset_text_sentence'] +vm-07-05: data_per_class_fraction ......................... 1.0 +vm-07-05: data_sharding ................................... True +vm-07-05: dataloader_type ................................. single +vm-07-05: DDP_impl ........................................ local +vm-07-05: decoder_num_layers .............................. None +vm-07-05: decoder_seq_length .............................. 
None +vm-07-05: deepscale ....................................... False +vm-07-05: deepscale_config ................................ None +vm-07-05: deepspeed ....................................... True +vm-07-05: deepspeed_activation_checkpointing .............. False +vm-07-05: deepspeed_config ................................ ds_config_gbs2048_mbs2_log1_zero1.json +vm-07-05: deepspeed_mpi ................................... False +vm-07-05: dino_bottleneck_size ............................ 256 +vm-07-05: dino_freeze_last_layer .......................... 1 +vm-07-05: dino_head_hidden_size ........................... 2048 +vm-07-05: dino_local_crops_number ......................... 10 +vm-07-05: dino_local_img_size ............................. 96 +vm-07-05: dino_norm_last_layer ............................ False +vm-07-05: dino_teacher_temp ............................... 0.07 +vm-07-05: dino_warmup_teacher_temp ........................ 0.04 +vm-07-05: dino_warmup_teacher_temp_epochs ................. 30 +vm-07-05: distribute_checkpointed_activations ............. False +vm-07-05: distribute_saved_activations .................... False +vm-07-05: distributed_backend ............................. nccl +vm-07-05: distributed_timeout_minutes ..................... 10 +vm-07-05: ds_inference .................................... False +vm-07-05: ds_pipeline_enabled ............................. False +vm-07-05: ds_sequence_parallel_size ....................... 1 +vm-07-05: embedding_path .................................. None +vm-07-05: embedding_weights_in_fp32 ....................... False +vm-07-05: empty_unused_memory_level ....................... 0 +vm-07-05: enable_expert_tensor_parallelism ................ False +vm-07-05: encoder_num_layers .............................. 32 +vm-07-05: encoder_seq_length .............................. 2048 +vm-07-05: end_weight_decay ................................ 
0.1 +vm-07-05: eod_mask_loss ................................... False +vm-07-05: eval_interval ................................... 500 +vm-07-05: eval_iters ...................................... 10 +vm-07-05: evidence_data_path .............................. None +vm-07-05: exit_duration_in_mins ........................... 30000000 +vm-07-05: exit_interval ................................... None +vm-07-05: exit_on_missing_checkpoint ...................... False +vm-07-05: exit_signal_handler ............................. False +vm-07-05: expert_interval ................................. 2 +vm-07-05: ffn_hidden_size ................................. 16384 +vm-07-05: finetune ........................................ False +vm-07-05: force_ds_sequence_parallel ...................... False +vm-07-05: fp16 ............................................ True +vm-07-05: fp16_lm_cross_entropy ........................... False +vm-07-05: fp32_residual_connection ........................ False +vm-07-05: fp8_amax_compute_algo ........................... most_recent +vm-07-05: fp8_amax_history_len ............................ 1 +vm-07-05: fp8_e4m3 ........................................ False +vm-07-05: fp8_hybrid ...................................... False +vm-07-05: fp8_interval .................................... 1 +vm-07-05: fp8_margin ...................................... 0 +vm-07-05: fp8_wgrad ....................................... True +vm-07-05: global_batch_size ............................... 2048 +vm-07-05: gradient_accumulation_fusion .................... False +vm-07-05: head_lr_mult .................................... 1.0 +vm-07-05: hidden_dropout .................................. 0.0 +vm-07-05: hidden_size ..................................... 4096 +vm-07-05: hidden_size_teacher ............................. None +vm-07-05: hysteresis ...................................... 2 +vm-07-05: ict_head_size ................................... 
None +vm-07-05: ict_load ........................................ None +vm-07-05: img_h ........................................... 224 +vm-07-05: img_w ........................................... 224 +vm-07-05: indexer_batch_size .............................. 128 +vm-07-05: indexer_log_interval ............................ 1000 +vm-07-05: inference ....................................... False +vm-07-05: inference_batch_times_seqlen_threshold .......... 512 +vm-07-05: init_method_std ................................. 0.009 +vm-07-05: init_method_xavier_uniform ...................... False +vm-07-05: initial_loss_scale .............................. 4294967296 +vm-07-05: iter_per_epoch .................................. 1250 +vm-07-05: kd .............................................. False +vm-07-05: kd_alpha_ce ..................................... 1 +vm-07-05: kd_beta_ce ...................................... 1 +vm-07-05: kd_temp ......................................... 1.0 +vm-07-05: kv_channels ..................................... 128 +vm-07-05: layernorm_epsilon ............................... 1e-05 +vm-07-05: lazy_mpu_init ................................... None +vm-07-05: load ............................................ .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase +vm-07-05: load_teacher .................................... None +vm-07-05: local_rank ...................................... 0 +vm-07-05: log_batch_size_to_tensorboard ................... False +vm-07-05: log_interval .................................... 1 +vm-07-05: log_learning_rate_to_tensorboard ................ True +vm-07-05: log_loss_scale_to_tensorboard ................... True +vm-07-05: log_memory_to_tensorboard ....................... False +vm-07-05: log_num_zeros_in_grad ........................... False +vm-07-05: log_optimizer_states_to_tensorboard ............. False +vm-07-05: log_params_norm ................................. 
False +vm-07-05: log_timers_to_tensorboard ....................... False +vm-07-05: log_validation_ppl_to_tensorboard ............... False +vm-07-05: log_world_size_to_tensorboard ................... False +vm-07-05: loss_scale ...................................... None +vm-07-05: loss_scale_window ............................... 1000 +vm-07-05: lr .............................................. 0.00012 +vm-07-05: lr_decay_iters .................................. None +vm-07-05: lr_decay_samples ................................ 43945312 +vm-07-05: lr_decay_style .................................. cosine +vm-07-05: lr_decay_tokens ................................. None +vm-07-05: lr_warmup_fraction .............................. None +vm-07-05: lr_warmup_iters ................................. 0 +vm-07-05: lr_warmup_samples ............................... 2048000 +vm-07-05: lr_warmup_tokens ................................ None +vm-07-05: make_vocab_size_divisible_by .................... 128 +vm-07-05: mask_factor ..................................... 1.0 +vm-07-05: mask_prob ....................................... 0.15 +vm-07-05: mask_type ....................................... random +vm-07-05: masked_softmax_fusion ........................... True +vm-07-05: max_position_embeddings ......................... 2048 +vm-07-05: max_tokens_to_oom ............................... 12000 +vm-07-05: mem_efficient_ln ................................ True +vm-07-05: memory_centric_tiled_linear ..................... False +vm-07-05: merge_file ...................................... gpt2-merges.txt +vm-07-05: micro_batch_size ................................ 2 +vm-07-05: min_loss_scale .................................. 1.0 +vm-07-05: min_lr .......................................... 1e-06 +vm-07-05: mlp_type ........................................ standard +vm-07-05: mmap_warmup ..................................... 
False +vm-07-05: moe_eval_capacity_factor ........................ 1.0 +vm-07-05: moe_expert_parallel_size ........................ 1 +vm-07-05: moe_loss_coeff .................................. 0.1 +vm-07-05: moe_min_capacity ................................ 4 +vm-07-05: moe_token_dropping .............................. True +vm-07-05: moe_train_capacity_factor ....................... 1.0 +vm-07-05: mos ............................................. False +vm-07-05: no_load_lr_state ................................ False +vm-07-05: no_load_optim ................................... None +vm-07-05: no_load_rng ..................................... None +vm-07-05: no_persist_layer_norm ........................... False +vm-07-05: no_pipeline_parallel ............................ True +vm-07-05: no_save_optim ................................... None +vm-07-05: no_save_rng ..................................... None +vm-07-05: normalization ................................... layernorm +vm-07-05: num_attention_heads ............................. 32 +vm-07-05: num_attention_heads_teacher ..................... None +vm-07-05: num_channels .................................... 3 +vm-07-05: num_classes ..................................... 1000 +vm-07-05: num_experts ..................................... [1] +vm-07-05: num_experts_switch .............................. None +vm-07-05: num_experts_teacher ............................. [1] +vm-07-05: num_key_value_heads ............................. 32 +vm-07-05: num_layers ...................................... 32 +vm-07-05: num_layers_per_virtual_pipeline_stage ........... None +vm-07-05: num_layers_teacher .............................. None +vm-07-05: num_workers ..................................... 2 +vm-07-05: onnx_safe ....................................... None +vm-07-05: openai_gelu ..................................... False +vm-07-05: optimizer ....................................... 
adam +vm-07-05: output_bert_embeddings .......................... False +vm-07-05: overlap_p2p_comm ................................ False +vm-07-05: override_opt_param_scheduler .................... True +vm-07-05: params_dtype .................................... torch.float16 +vm-07-05: partition_activations ........................... False +vm-07-05: patch_dim ....................................... 16 +vm-07-05: perform_initialization .......................... True +vm-07-05: pipeline_model_parallel_size .................... 1 +vm-07-05: pipeline_model_parallel_split_rank .............. None +vm-07-05: profile_backward ................................ False +vm-07-05: query_in_block_prob ............................. 0.1 +vm-07-05: rampup_batch_size ............................... None +vm-07-05: random_ltd ...................................... False +vm-07-05: rank ............................................ 0 +vm-07-05: recompute_granularity ........................... None +vm-07-05: recompute_method ................................ None +vm-07-05: recompute_num_layers ............................ 1 +vm-07-05: remote_device ................................... none +vm-07-05: reset_attention_mask ............................ False +vm-07-05: reset_iteration ................................. False +vm-07-05: reset_position_ids .............................. False +vm-07-05: retriever_report_topk_accuracies ................ [] +vm-07-05: retriever_score_scaling ......................... False +vm-07-05: retriever_seq_length ............................ 256 +vm-07-05: retro_add_retriever ............................. False +vm-07-05: retro_cyclic_train_iters ........................ None +vm-07-05: retro_encoder_attention_dropout ................. 0.1 +vm-07-05: retro_encoder_hidden_dropout .................... 0.1 +vm-07-05: retro_encoder_layers ............................ 2 +vm-07-05: retro_num_neighbors ............................. 
2 +vm-07-05: retro_num_retrieved_chunks ...................... 2 +vm-07-05: retro_return_doc_ids ............................ False +vm-07-05: retro_workdir ................................... None +vm-07-05: return_data_index ............................... False +vm-07-05: rotary_percent .................................. 1.0 +vm-07-05: sample_rate ..................................... 1.0 +vm-07-05: save ............................................ .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase +vm-07-05: save_interval ................................... 10000 +vm-07-05: scatter_gather_tensors_in_pipeline .............. True +vm-07-05: scattered_embeddings ............................ False +vm-07-05: seed ............................................ 1234 +vm-07-05: seq_length ...................................... 2048 +vm-07-05: sequence_parallel ............................... False +vm-07-05: sgd_momentum .................................... 0.9 +vm-07-05: short_seq_prob .................................. 0.1 +vm-07-05: skip_train ...................................... False +vm-07-05: split ........................................... 949,50,1 +vm-07-05: split_transformers .............................. False +vm-07-05: squared_relu .................................... False +vm-07-05: standalone_embedding_stage ...................... False +vm-07-05: start_weight_decay .............................. 0.1 +vm-07-05: swiglu .......................................... False +vm-07-05: swin_backbone_type .............................. tiny +vm-07-05: synchronize_each_layer .......................... False +vm-07-05: tensor_model_parallel_size ...................... 1 +vm-07-05: tensorboard_dir ................................. None +vm-07-05: tensorboard_log_interval ........................ 1 +vm-07-05: tensorboard_queue_size .......................... 1000 +vm-07-05: test_data_path .................................. 
None +vm-07-05: tile_factor ..................................... 1 +vm-07-05: timing_log_level ................................ 0 +vm-07-05: timing_log_option ............................... minmax +vm-07-05: titles_data_path ................................ None +vm-07-05: tokenizer_model ................................. None +vm-07-05: tokenizer_type .................................. GPT2BPETokenizer +vm-07-05: topk ............................................ 1 +vm-07-05: train_data_exact_num_epochs ..................... None +vm-07-05: train_data_path ................................. None +vm-07-05: train_desc_path ................................. None +vm-07-05: train_doc_idx_path .............................. None +vm-07-05: train_idx_path .................................. None +vm-07-05: train_iters ..................................... None +vm-07-05: train_sample_idx_path ........................... None +vm-07-05: train_samples ................................... 10240 +vm-07-05: train_shuffle_idx_path .......................... None +vm-07-05: train_tokens .................................... 300000000000 +vm-07-05: transformer_impl ................................ local +vm-07-05: transformer_pipeline_model_parallel_size ........ 1 +vm-07-05: universal_checkpoint ............................ False +vm-07-05: untie_embeddings_and_output_weights ............. False +vm-07-05: use_checkpoint_args ............................. False +vm-07-05: use_checkpoint_opt_param_scheduler .............. False +vm-07-05: use_contiguous_buffers_in_local_ddp ............. True +vm-07-05: use_cpu_initialization .......................... None +vm-07-05: use_dataset_only ................................ False +vm-07-05: use_distributed_optimizer ....................... True +vm-07-05: use_flash_attn .................................. False +vm-07-05: use_flash_attn_triton ........................... False +vm-07-05: use_flash_attn_v1 ............................... 
False +vm-07-05: use_flash_attn_v2 ............................... False +vm-07-05: use_one_sent_docs ............................... False +vm-07-05: use_pin_memory .................................. False +vm-07-05: use_ring_exchange_p2p ........................... False +vm-07-05: use_rotary_position_embeddings .................. True +vm-07-05: use_tutel ....................................... False +vm-07-05: valid_data_path ................................. None +vm-07-05: variable_seq_lengths ............................ False +vm-07-05: virtual_pipeline_model_parallel_size ............ None +vm-07-05: vision_backbone_type ............................ vit +vm-07-05: vision_pretraining .............................. False +vm-07-05: vision_pretraining_type ......................... classify +vm-07-05: vocab_extra_ids ................................. 0 +vm-07-05: vocab_file ...................................... gpt2-vocab.json +vm-07-05: vocab_size ...................................... None +vm-07-05: weight_decay .................................... 0.1 +vm-07-05: weight_decay_incr_style ......................... constant +vm-07-05: world_size ...................................... 16 +vm-07-05: zero_allgather_bucket_size ...................... 0.0 +vm-07-05: zero_contagious_gradients ....................... False +vm-07-05: zero_reduce_bucket_size ......................... 0.0 +vm-07-05: zero_reduce_scatter ............................. False +vm-07-05: zero_stage ...................................... 1 +vm-07-05: -------------------- end of arguments --------------------- +vm-07-05: setting number of micro-batches to constant 64 +vm-07-05: > building GPT2BPETokenizer tokenizer ... +vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-14: async_io ............... [NO] ....... [OKAY] +vm-07-14: fused_adam ............. [NO] ....... [OKAY] +vm-07-14: cpu_adam ............... [NO] ....... 
[OKAY] +vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-14: cpu_lion ............... [NO] ....... [OKAY] +vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-14: evoformer_attn ......... [NO] ....... [NO] +vm-07-14: fused_lamb ............. [NO] ....... [OKAY] +vm-07-14: fused_lion ............. [NO] ....... [OKAY] +vm-07-14: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-14: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-14: quantizer .............. [NO] ....... [OKAY] +vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-14: ragged_ops ............. [NO] ....... [OKAY] +vm-07-14: random_ltd ............. [NO] ....... [OKAY] +vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-14: sparse_attn ............ [NO] ....... [NO] +vm-07-14: spatial_inference ...... [NO] ....... [OKAY] +vm-07-14: transformer ............ [NO] ....... [OKAY] +vm-07-14: stochastic_transformer . [NO] ....... [OKAY] +vm-07-14: transformer_inference .. [NO] ....... [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed general environment info: +vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-14: torch version .................... 2.1.0a0+gita09f30a +vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-14: torch cuda version ............... None +vm-07-14: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-14: nvcc version ..................... None +vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-14: shared memory (/dev/shm) size .... 
865.10 GB +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed C++/CUDA extension op report +vm-07-05: -------------------------------------------------- +vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-05: runtime if needed. Op compatibility means that your system +vm-07-05: meet the required dependencies to JIT install the op. +vm-07-05: -------------------------------------------------- +vm-07-05: JIT compiled ops requires ninja +vm-07-05: ninja .................. [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: op name ................ installed .. compatible +vm-07-05: -------------------------------------------------- +vm-07-05: [2023-11-29 08:50:53,882] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed C++/CUDA extension op report +vm-07-05: -------------------------------------------------- +vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-05: runtime if needed. Op compatibility means that your system +vm-07-05: meet the required dependencies to JIT install the op. +vm-07-05: -------------------------------------------------- +vm-07-05: JIT compiled ops requires ninja +vm-07-05: ninja .................. [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: op name ................ installed .. compatible +vm-07-05: -------------------------------------------------- +vm-07-05: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +vm-07-05: > initializing torch distributed ... 
+vm-07-05: [2023-11-29 08:50:53,893] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-05: [2023-11-29 08:50:53,893] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +vm-07-05: [2023-11-29 08:50:53,901] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed C++/CUDA extension op report +vm-07-05: -------------------------------------------------- +vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-05: runtime if needed. Op compatibility means that your system +vm-07-05: meet the required dependencies to JIT install the op. +vm-07-05: -------------------------------------------------- +vm-07-05: JIT compiled ops requires ninja +vm-07-05: ninja .................. [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: op name ................ installed .. compatible +vm-07-05: -------------------------------------------------- +vm-07-05: async_io ............... [NO] ....... [OKAY] +vm-07-05: fused_adam ............. [NO] ....... [OKAY] +vm-07-05: cpu_adam ............... [NO] ....... [OKAY] +vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-05: cpu_lion ............... [NO] ....... [OKAY] +vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-05: evoformer_attn ......... [NO] ....... [NO] +vm-07-05: fused_lamb ............. [NO] ....... [OKAY] +vm-07-05: fused_lion ............. [NO] ....... [OKAY] +vm-07-05: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-05: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-05: quantizer .............. [NO] ....... [OKAY] +vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-05: ragged_ops ............. [NO] ....... [OKAY] +vm-07-05: random_ltd ............. [NO] ....... 
[OKAY] +vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-05: sparse_attn ............ [NO] ....... [NO] +vm-07-05: spatial_inference ...... [NO] ....... [OKAY] +vm-07-05: transformer ............ [NO] ....... [OKAY] +vm-07-05: stochastic_transformer . [NO] ....... [OKAY] +vm-07-05: transformer_inference .. [NO] ....... [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed general environment info: +vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-05: torch version .................... 2.1.0a0+gita09f30a +vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-05: torch cuda version ............... None +vm-07-05: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-05: nvcc version ..................... None +vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-05: shared memory (/dev/shm) size .... 865.10 GB +vm-07-05: async_io ............... [NO] ....... [OKAY] +vm-07-05: fused_adam ............. [NO] ....... [OKAY] +vm-07-05: cpu_adam ............... [NO] ....... [OKAY] +vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-05: cpu_lion ............... [NO] ....... [OKAY] +vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-05: evoformer_attn ......... [NO] ....... [NO] +vm-07-05: fused_lamb ............. [NO] ....... [OKAY] +vm-07-05: fused_lion ............. [NO] ....... [OKAY] +vm-07-05: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-05: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-05: quantizer .............. [NO] ....... [OKAY] +vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-05: ragged_ops ............. [NO] ....... [OKAY] +vm-07-05: random_ltd ............. [NO] ....... 
[OKAY] +vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-05: sparse_attn ............ [NO] ....... [NO] +vm-07-05: spatial_inference ...... [NO] ....... [OKAY] +vm-07-05: transformer ............ [NO] ....... [OKAY] +vm-07-05: stochastic_transformer . [NO] ....... [OKAY] +vm-07-05: transformer_inference .. [NO] ....... [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed general environment info: +vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-05: torch version .................... 2.1.0a0+gita09f30a +vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-05: torch cuda version ............... None +vm-07-05: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-05: nvcc version ..................... None +vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-05: shared memory (/dev/shm) size .... 865.10 GB +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed C++/CUDA extension op report +vm-07-05: -------------------------------------------------- +vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-05: runtime if needed. Op compatibility means that your system +vm-07-05: meet the required dependencies to JIT install the op. +vm-07-05: -------------------------------------------------- +vm-07-05: JIT compiled ops requires ninja +vm-07-05: ninja .................. [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: op name ................ installed .. 
compatible +vm-07-05: -------------------------------------------------- +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed C++/CUDA extension op report +vm-07-14: -------------------------------------------------- +vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-14: runtime if needed. Op compatibility means that your system +vm-07-14: meet the required dependencies to JIT install the op. +vm-07-14: -------------------------------------------------- +vm-07-14: JIT compiled ops requires ninja +vm-07-14: ninja .................. [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: op name ................ installed .. compatible +vm-07-14: -------------------------------------------------- +vm-07-14: [2023-11-29 08:50:53,958] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed C++/CUDA extension op report +vm-07-05: -------------------------------------------------- +vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-05: runtime if needed. Op compatibility means that your system +vm-07-05: meet the required dependencies to JIT install the op. +vm-07-05: -------------------------------------------------- +vm-07-05: JIT compiled ops requires ninja +vm-07-05: ninja .................. [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: op name ................ installed .. compatible +vm-07-05: -------------------------------------------------- +vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-05: async_io ............... [NO] ....... [OKAY] +vm-07-05: fused_adam ............. [NO] ....... [OKAY] +vm-07-05: cpu_adam ............... [NO] ....... [OKAY] +vm-07-05: cpu_adagrad ............ [NO] ....... 
[OKAY] +vm-07-05: cpu_lion ............... [NO] ....... [OKAY] +vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-05: evoformer_attn ......... [NO] ....... [NO] +vm-07-05: fused_lamb ............. [NO] ....... [OKAY] +vm-07-05: fused_lion ............. [NO] ....... [OKAY] +vm-07-05: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-05: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-05: quantizer .............. [NO] ....... [OKAY] +vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-05: ragged_ops ............. [NO] ....... [OKAY] +vm-07-05: random_ltd ............. [NO] ....... [OKAY] +vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-05: sparse_attn ............ [NO] ....... [NO] +vm-07-05: spatial_inference ...... [NO] ....... [OKAY] +vm-07-05: transformer ............ [NO] ....... [OKAY] +vm-07-05: stochastic_transformer . [NO] ....... [OKAY] +vm-07-05: transformer_inference .. [NO] ....... [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed general environment info: +vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-05: torch version .................... 2.1.0a0+gita09f30a +vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-05: torch cuda version ............... None +vm-07-05: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-05: nvcc version ..................... None +vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-05: shared memory (/dev/shm) size .... 865.10 GB +vm-07-14: async_io ............... [NO] ....... [OKAY] +vm-07-14: fused_adam ............. [NO] ....... [OKAY] +vm-07-14: cpu_adam ............... [NO] ....... [OKAY] +vm-07-14: cpu_adagrad ............ [NO] ....... 
[OKAY] +vm-07-14: cpu_lion ............... [NO] ....... [OKAY] +vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-14: evoformer_attn ......... [NO] ....... [NO] +vm-07-14: fused_lamb ............. [NO] ....... [OKAY] +vm-07-14: fused_lion ............. [NO] ....... [OKAY] +vm-07-14: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-14: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-14: quantizer .............. [NO] ....... [OKAY] +vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-14: ragged_ops ............. [NO] ....... [OKAY] +vm-07-14: random_ltd ............. [NO] ....... [OKAY] +vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-14: sparse_attn ............ [NO] ....... [NO] +vm-07-14: spatial_inference ...... [NO] ....... [OKAY] +vm-07-14: transformer ............ [NO] ....... [OKAY] +vm-07-14: stochastic_transformer . [NO] ....... [OKAY] +vm-07-14: transformer_inference .. [NO] ....... [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed general environment info: +vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-14: torch version .................... 2.1.0a0+gita09f30a +vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-14: torch cuda version ............... None +vm-07-14: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-14: nvcc version ..................... None +vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-14: shared memory (/dev/shm) size .... 865.10 GB +vm-07-05: async_io ............... [NO] ....... [OKAY] +vm-07-05: fused_adam ............. [NO] ....... [OKAY] +vm-07-05: cpu_adam ............... [NO] ....... [OKAY] +vm-07-05: cpu_adagrad ............ [NO] ....... 
[OKAY] +vm-07-05: cpu_lion ............... [NO] ....... [OKAY] +vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-05: evoformer_attn ......... [NO] ....... [NO] +vm-07-05: fused_lamb ............. [NO] ....... [OKAY] +vm-07-05: fused_lion ............. [NO] ....... [OKAY] +vm-07-05: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-05: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-05: quantizer .............. [NO] ....... [OKAY] +vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-05: ragged_ops ............. [NO] ....... [OKAY] +vm-07-05: random_ltd ............. [NO] ....... [OKAY] +vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-05: sparse_attn ............ [NO] ....... [NO] +vm-07-05: spatial_inference ...... [NO] ....... [OKAY] +vm-07-05: transformer ............ [NO] ....... [OKAY] +vm-07-05: stochastic_transformer . [NO] ....... [OKAY] +vm-07-05: transformer_inference .. [NO] ....... [OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed general environment info: +vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-05: torch version .................... 2.1.0a0+gita09f30a +vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-05: torch cuda version ............... None +vm-07-05: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-05: nvcc version ..................... None +vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-05: shared memory (/dev/shm) size .... 
865.10 GB +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed C++/CUDA extension op report +vm-07-14: -------------------------------------------------- +vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-14: runtime if needed. Op compatibility means that your system +vm-07-14: meet the required dependencies to JIT install the op. +vm-07-14: -------------------------------------------------- +vm-07-14: JIT compiled ops requires ninja +vm-07-14: ninja .................. [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: op name ................ installed .. compatible +vm-07-14: -------------------------------------------------- +vm-07-05: async_io ............... [NO] ....... [OKAY] +vm-07-05: fused_adam ............. [NO] ....... [OKAY] +vm-07-05: cpu_adam ............... [NO] ....... [OKAY] +vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-05: cpu_lion ............... [NO] ....... [OKAY] +vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-05: evoformer_attn ......... [NO] ....... [NO] +vm-07-05: fused_lamb ............. [NO] ....... [OKAY] +vm-07-05: fused_lion ............. [NO] ....... [OKAY] +vm-07-05: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-05: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-05: quantizer .............. [NO] ....... [OKAY] +vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-05: ragged_ops ............. [NO] ....... [OKAY] +vm-07-05: random_ltd ............. [NO] ....... [OKAY] +vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-05: sparse_attn ............ [NO] ....... [NO] +vm-07-05: spatial_inference ...... [NO] ....... [OKAY] +vm-07-05: transformer ............ [NO] ....... [OKAY] +vm-07-05: stochastic_transformer . [NO] ....... [OKAY] +vm-07-05: transformer_inference .. [NO] ....... 
[OKAY] +vm-07-05: -------------------------------------------------- +vm-07-05: DeepSpeed general environment info: +vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-05: torch version .................... 2.1.0a0+gita09f30a +vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-05: torch cuda version ............... None +vm-07-05: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-05: nvcc version ..................... None +vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-05: shared memory (/dev/shm) size .... 865.10 GB +vm-07-05: [2023-11-29 08:50:54,034] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed C++/CUDA extension op report +vm-07-14: -------------------------------------------------- +vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-14: runtime if needed. Op compatibility means that your system +vm-07-14: meet the required dependencies to JIT install the op. +vm-07-14: -------------------------------------------------- +vm-07-14: JIT compiled ops requires ninja +vm-07-14: ninja .................. [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: op name ................ installed .. compatible +vm-07-14: -------------------------------------------------- +vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed C++/CUDA extension op report +vm-07-14: -------------------------------------------------- +vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-14: runtime if needed. 
Op compatibility means that your system +vm-07-14: meet the required dependencies to JIT install the op. +vm-07-14: -------------------------------------------------- +vm-07-14: JIT compiled ops requires ninja +vm-07-14: ninja .................. [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: op name ................ installed .. compatible +vm-07-14: -------------------------------------------------- +vm-07-05: [2023-11-29 08:50:54,051] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-14: async_io ............... [NO] ....... [OKAY] +vm-07-14: fused_adam ............. [NO] ....... [OKAY] +vm-07-14: cpu_adam ............... [NO] ....... [OKAY] +vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-14: cpu_lion ............... [NO] ....... [OKAY] +vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-14: evoformer_attn ......... [NO] ....... [NO] +vm-07-14: fused_lamb ............. [NO] ....... [OKAY] +vm-07-14: fused_lion ............. [NO] ....... [OKAY] +vm-07-14: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-14: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-14: quantizer .............. [NO] ....... [OKAY] +vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-14: ragged_ops ............. [NO] ....... [OKAY] +vm-07-14: random_ltd ............. [NO] ....... [OKAY] +vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-14: sparse_attn ............ [NO] ....... [NO] +vm-07-14: spatial_inference ...... [NO] ....... [OKAY] +vm-07-14: transformer ............ [NO] ....... [OKAY] +vm-07-14: stochastic_transformer . [NO] ....... [OKAY] +vm-07-14: transformer_inference .. [NO] ....... 
[OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed general environment info: +vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-14: torch version .................... 2.1.0a0+gita09f30a +vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-14: torch cuda version ............... None +vm-07-14: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-14: nvcc version ..................... None +vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-14: shared memory (/dev/shm) size .... 865.10 GB +vm-07-05: [2023-11-29 08:50:54,091] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed C++/CUDA extension op report +vm-07-14: -------------------------------------------------- +vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-14: runtime if needed. Op compatibility means that your system +vm-07-14: meet the required dependencies to JIT install the op. +vm-07-14: -------------------------------------------------- +vm-07-14: JIT compiled ops requires ninja +vm-07-14: ninja .................. [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: op name ................ installed .. compatible +vm-07-14: -------------------------------------------------- +vm-07-05: [2023-11-29 08:50:54,104] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: async_io ............... [NO] ....... [OKAY] +vm-07-14: fused_adam ............. [NO] .......async_io [OKAY] +vm-07-14: ............... [NO] ....... cpu_adam[OKAY] +vm-07-14: ............... [NO] ....... [OKAY] +vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-14: fused_adam cpu_lion............. ...............[NO] [NO]....... 
.......[OKAY] +vm-07-14: [OKAY] +vm-07-14: cpu_adam ............... [NO] ....... [OKAY] +vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-14: evoformer_attncpu_lion ........................ [NO][NO] .............. [NO][OKAY] +vm-07-14: +vm-07-14: fused_lamb ............. [NO] ....... [OKAY] +vm-07-14: fused_lion ............. [NO] ....... [OKAY] +vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-14: evoformer_attn ......... [NO] ....... [NO] +vm-07-14: fused_lamb ............. [NO] ....... [OKAY] +vm-07-14: inference_core_ops ..... [NO] ....... fused_lion[OKAY] +vm-07-14: ............. [NO] ....... [OKAY] +vm-07-14: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-14: quantizer .............. [NO] ....... [OKAY] +vm-07-14: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-14: cutlass_ops ............ [NO] ....... ragged_ops[OKAY] +vm-07-14: ............. [NO]quantizer ..................... [OKAY][NO] +vm-07-14: ....... [OKAY]random_ltd +vm-07-14: ............. [NO] ....... [OKAY] +vm-07-14: ragged_device_ops ...... [NO] ....... [WARNING]  sparse_attn is not compatible with ROCM +vm-07-14: [OKAY] +vm-07-14: sparse_attn ............ [NO]ragged_ops .................... [NO][NO] +vm-07-14: ....... [OKAY] +vm-07-14: random_ltdspatial_inference ................... [NO][NO] .............. [OKAY][OKAY] +vm-07-14: +vm-07-14: transformer ............ [WARNING]  sparse_attn is not compatible with ROCM +vm-07-14: [NO] sparse_attn....... ............[OKAY] +vm-07-14: [NO] ....... [NO]stochastic_transformer +vm-07-14: . [NO] ....... [OKAY] +vm-07-14: spatial_inference ...... [NO] ....... [OKAY] +vm-07-14: transformer ............transformer_inference [NO].. .......[NO] [OKAY]....... 
+vm-07-14: [OKAY] +vm-07-14: --------------------------------------------------stochastic_transformer +vm-07-14: . [NO] ....... [OKAY] +vm-07-14: transformer_inference .. [NO] ....... [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed general environment info: +vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-14: torch version .................... 2.1.0a0+gita09f30a +vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-14: deepspeed info DeepSpeed general environment info:................... +vm-07-14: 0.12.3, unknown, unknown +vm-07-14: torch install pathtorch cuda version .............................. None +vm-07-14: torch hip version ................ 5.7.31920-f5021ed14['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-14: +vm-07-14: nvcc version ..................... Nonetorch version +vm-07-14: deepspeed wheel compiled w..................... ...... 2.1.0a0+gita09f30atorch 2.1, hip 5.7 +vm-07-14: +vm-07-14: deepspeed install pathshared memory (/dev/shm) size ............... 865.10 GB['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-14: +vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-14: torch cuda version ............... None +vm-07-14: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-14: nvcc version ..................... None +vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-14: shared memory (/dev/shm) size .... 865.10 GB +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed C++/CUDA extension op report +vm-07-14: -------------------------------------------------- +vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-14: runtime if needed. 
Op compatibility means that your system +vm-07-14: meet the required dependencies to JIT install the op. +vm-07-14: -------------------------------------------------- +vm-07-14: JIT compiled ops requires ninja +vm-07-14: ninja .................. [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: op name ................ installed .. compatible +vm-07-14: -------------------------------------------------- +vm-07-14: [2023-11-29 08:50:54,109] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-05: [2023-11-29 08:50:54,130] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed C++/CUDA extension op report +vm-07-14: -------------------------------------------------- +vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at +vm-07-14: runtime if needed. Op compatibility means that your system +vm-07-14: meet the required dependencies to JIT install the op. +vm-07-14: -------------------------------------------------- +vm-07-14: JIT compiled ops requires ninja +vm-07-14: ninja .................. [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: op name ................ installed .. compatible +vm-07-14: -------------------------------------------------- +vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-14: async_io ............... [NO] ....... [OKAY] +vm-07-14: fused_adam ............. [NO] ....... [OKAY] +vm-07-14: cpu_adam ............... [NO] ....... [OKAY] +vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-14: cpu_lion ............... [NO] ....... [OKAY] +vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-14: evoformer_attn ......... [NO] ....... 
[NO] +vm-07-14: fused_lamb ............. [NO] ....... [OKAY] +vm-07-14: fused_lion ............. [NO] ....... [OKAY] +vm-07-14: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-14: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-14: quantizer .............. [NO] ....... [OKAY] +vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-14: ragged_ops ............. [NO] ....... [OKAY] +vm-07-14: random_ltd ............. [NO] ....... [OKAY] +vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-14: sparse_attn ............ [NO] ....... [NO] +vm-07-14: spatial_inference ...... [NO] ....... [OKAY] +vm-07-14: transformer ............ [NO] ....... [OKAY] +vm-07-14: stochastic_transformer . [NO] ....... [OKAY] +vm-07-14: transformer_inference .. [NO] ....... [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed general environment info: +vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-14: torch version .................... 2.1.0a0+gita09f30a +vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-14: torch cuda version ............... None +vm-07-14: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-14: nvcc version ..................... None +vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-14: shared memory (/dev/shm) size .... 865.10 GB +vm-07-14: async_io ............... [NO] ....... [OKAY] +vm-07-14: fused_adam ............. [NO] ....... [OKAY] +vm-07-14: cpu_adam ............... [NO] ....... [OKAY] +vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-14: cpu_lion ............... [NO] ....... [OKAY] +vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-14: evoformer_attn ......... [NO] ....... 
[NO] +vm-07-14: fused_lamb ............. [NO] ....... [OKAY] +vm-07-14: fused_lion ............. [NO] ....... [OKAY] +vm-07-14: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-14: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-14: quantizer .............. [NO] ....... [OKAY] +vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-14: ragged_ops ............. [NO] ....... [OKAY] +vm-07-14: random_ltd ............. [NO] ....... [OKAY] +vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-14: sparse_attn ............ [NO] ....... [NO] +vm-07-14: spatial_inference ...... [NO] ....... [OKAY] +vm-07-14: transformer ............ [NO] ....... [OKAY] +vm-07-14: stochastic_transformer . [NO] ....... [OKAY] +vm-07-14: transformer_inference .. [NO] ....... [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed general environment info: +vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-14: torch version .................... 2.1.0a0+gita09f30a +vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-14: torch cuda version ............... None +vm-07-14: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-14: nvcc version ..................... None +vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-14: shared memory (/dev/shm) size .... 865.10 GB +vm-07-14: [2023-11-29 08:50:54,175] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: async_io ............... [NO] ....... [OKAY] +vm-07-14: fused_adam ............. [NO] ....... [OKAY] +vm-07-14: cpu_adam ............... [NO] ....... [OKAY] +vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY] +vm-07-14: cpu_lion ............... [NO] ....... 
[OKAY] +vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +vm-07-14: evoformer_attn ......... [NO] ....... [NO] +vm-07-14: fused_lamb ............. [NO] ....... [OKAY] +vm-07-14: fused_lion ............. [NO] ....... [OKAY] +vm-07-14: inference_core_ops ..... [NO] ....... [OKAY] +vm-07-14: cutlass_ops ............ [NO] ....... [OKAY] +vm-07-14: quantizer .............. [NO] ....... [OKAY] +vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY] +vm-07-14: ragged_ops ............. [NO] ....... [OKAY] +vm-07-14: random_ltd ............. [NO] ....... [OKAY] +vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM +vm-07-14: sparse_attn ............ [NO] ....... [NO] +vm-07-14: spatial_inference ...... [NO] ....... [OKAY] +vm-07-14: transformer ............ [NO] ....... [OKAY] +vm-07-14: stochastic_transformer . [NO] ....... [OKAY] +vm-07-14: transformer_inference .. [NO] ....... [OKAY] +vm-07-14: -------------------------------------------------- +vm-07-14: DeepSpeed general environment info: +vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch'] +vm-07-14: torch version .................... 2.1.0a0+gita09f30a +vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed'] +vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown +vm-07-14: torch cuda version ............... None +vm-07-14: torch hip version ................ 5.7.31920-f5021ed14 +vm-07-14: nvcc version ..................... None +vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7 +vm-07-14: shared memory (/dev/shm) size .... 
865.10 GB +vm-07-14: [2023-11-29 08:50:54,199] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: [2023-11-29 08:50:54,203] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main **** +vm-07-14: [2023-11-29 08:50:54,248] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: [2023-11-29 08:50:54,255] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-14: [2023-11-29 08:50:54,282] [INFO] [comm.py:637:init_distributed] cdb=None +vm-07-05: > initialized tensor model parallel with size 1 +vm-07-05: > initialized pipeline model parallel with size 1 +vm-07-05: > setting random seeds to 1234 ... +vm-07-05: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +vm-07-05: > compiling dataset index builder ... +vm-07-05: make: Entering directory '/root/Megatron-DeepSpeed/megatron/data' +vm-07-05: make: Nothing to be done for 'default'. +vm-07-05: make: Leaving directory '/root/Megatron-DeepSpeed/megatron/data' +vm-07-05: >>> done with dataset index builder. Compilation time: 0.047 seconds +vm-07-05: > compiling and loading fused kernels ... 
+vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +vm-07-05: Total number of unsupported CUDA function calls: 0 +vm-07-05: +vm-07-05: +vm-07-05: Total number of replaced kernel launches: 99 +vm-07-05: [1/1] c++ scaled_upper_triang_masked_softmax_hip.o scaled_upper_triang_masked_softmax_hip.cuda.o -shared -L/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/lib 
-lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/opt/rocm-6.0.0-12660/lib -lamdhip64 -o scaled_upper_triang_masked_softmax_cuda.so +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +vm-07-05: Total number of unsupported CUDA function calls: 0 +vm-07-05: +vm-07-05: +vm-07-05: Total number of replaced kernel launches: 69 +vm-07-05: [1/1] c++ scaled_masked_softmax_hip.o scaled_masked_softmax_hip.cuda.o -shared -L/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/opt/rocm-6.0.0-12660/lib -lamdhip64 -o scaled_masked_softmax_cuda.so +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_softmax.cpp -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_softmax_hip.cpp [skipped, already hipified] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_softmax_cuda.cu -> 
/root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_softmax_hip.hip [skipped, already hipified] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +vm-07-05: Total number of unsupported CUDA function calls: 0 +vm-07-05: +vm-07-05: +vm-07-05: Total number of replaced kernel launches: 69 +vm-07-05: ninja: no work to do. +vm-07-05: >>> done with compiling and loading fused kernels. Compilation time: 2.092 seconds +vm-07-05: time to initialize megatron (seconds): 4.207 +vm-07-05: [after megatron is initialized] datetime: 2023-11-29 08:50:57 +vm-07-05: building GPT model ... 
+vm-07-05: [2023-11-29 08:50:57,577] [INFO] [utils.py:802:see_memory_usage] Before Building Model +vm-07-05: [2023-11-29 08:50:57,578] [INFO] [utils.py:803:see_memory_usage] MA 0.0 GB Max_MA 2.13 GB CA 0.0 GB Max_CA 2 GB +vm-07-05: [2023-11-29 08:50:57,578] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 45.89 GB, percent = 2.7% +vm-07-05: [2023-11-29 08:50:57,670] [INFO] [utils.py:802:see_memory_usage] After Building Model +vm-07-05: [2023-11-29 08:50:57,671] [INFO] [utils.py:803:see_memory_usage] MA 12.39 GB Max_MA 12.39 GB CA 12.39 GB Max_CA 12 GB +vm-07-05: [2023-11-29 08:50:57,671] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 45.9 GB, percent = 2.7% +vm-07-05: > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 6650208256 +vm-07-05: setting training iterations to 5 +vm-07-05: > learning rate decay style: cosine +vm-07-05: DeepSpeed is enabled. +vm-07-05: [2023-11-29 08:50:57,673] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.3, git-hash=unknown, git-branch=unknown +vm-07-05: [2023-11-29 08:50:57,862] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +vm-07-05: [2023-11-29 08:50:57,863] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer +vm-07-05: [2023-11-29 08:50:57,863] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +vm-07-05: [2023-11-29 08:50:57,874] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +vm-07-05: [2023-11-29 08:50:57,874] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +vm-07-05: [2023-11-29 08:50:57,874] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 1 optimizer +vm-07-05: [2023-11-29 08:50:57,874] [INFO] [stage_1_and_2.py:147:__init__] Reduce bucket size 500,000,000 +vm-07-05: [2023-11-29 08:50:57,874] [INFO] 
[stage_1_and_2.py:148:__init__] Allgather bucket size 500,000,000 +vm-07-05: [2023-11-29 08:50:57,874] [INFO] [stage_1_and_2.py:149:__init__] CPU Offload: False +vm-07-05: [2023-11-29 08:50:57,874] [INFO] [stage_1_and_2.py:150:__init__] Round robin gradient partitioning: False +vm-07-05: [2023-11-29 08:51:11,210] [INFO] [utils.py:802:see_memory_usage] Before initializing optimizer states +vm-07-05: [2023-11-29 08:51:11,211] [INFO] [utils.py:803:see_memory_usage] MA 13.94 GB Max_MA 13.94 GB CA 13.96 GB Max_CA 14 GB +vm-07-05: [2023-11-29 08:51:11,211] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 217.2 GB, percent = 12.6% +vm-07-05: [2023-11-29 08:51:11,357] [INFO] [utils.py:802:see_memory_usage] After initializing optimizer states +vm-07-05: [2023-11-29 08:51:11,358] [INFO] [utils.py:803:see_memory_usage] MA 17.04 GB Max_MA 18.58 GB CA 18.6 GB Max_CA 19 GB +vm-07-05: [2023-11-29 08:51:11,358] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 218.51 GB, percent = 12.6% +vm-07-05: [2023-11-29 08:51:11,358] [INFO] [stage_1_and_2.py:514:__init__] optimizer state initialized +vm-07-05: [2023-11-29 08:51:11,648] [INFO] [utils.py:802:see_memory_usage] After initializing ZeRO optimizer +vm-07-05: [2023-11-29 08:51:11,648] [INFO] [utils.py:803:see_memory_usage] MA 17.04 GB Max_MA 17.04 GB CA 18.6 GB Max_CA 19 GB +vm-07-05: [2023-11-29 08:51:11,648] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 221.2 GB, percent = 12.8% +vm-07-05: [2023-11-29 08:51:11,650] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +vm-07-05: [2023-11-29 08:51:11,650] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +vm-07-05: [2023-11-29 08:51:11,650] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = +vm-07-05: [2023-11-29 08:51:11,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)] +vm-07-05: [2023-11-29 
08:51:11,651] [INFO] [config.py:974:print] DeepSpeedEngine configuration: +vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] activation_checkpointing_config { +vm-07-05: "partition_activations": false, +vm-07-05: "contiguous_memory_optimization": false, +vm-07-05: "cpu_checkpointing": false, +vm-07-05: "number_checkpoints": null, +vm-07-05: "synchronize_checkpoint_boundary": false, +vm-07-05: "profile": false +vm-07-05: } +vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] amp_enabled .................. False +vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] amp_params ................... False +vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] autotuning_config ............ { +vm-07-05: "enabled": false, +vm-07-05: "start_step": null, +vm-07-05: "end_step": null, +vm-07-05: "metric_path": null, +vm-07-05: "arg_mappings": null, +vm-07-05: "metric": "throughput", +vm-07-05: "model_info": null, +vm-07-05: "results_dir": "autotuning_results", +vm-07-05: "exps_dir": "autotuning_exps", +vm-07-05: "overwrite": true, +vm-07-05: "fast": true, +vm-07-05: "start_profile_step": 3, +vm-07-05: "end_profile_step": 5, +vm-07-05: "tuner_type": "gridsearch", +vm-07-05: "tuner_early_stopping": 5, +vm-07-05: "tuner_num_trials": 50, +vm-07-05: "model_info_path": null, +vm-07-05: "mp_size": 1, +vm-07-05: "max_train_batch_size": null, +vm-07-05: "min_train_batch_size": 1, +vm-07-05: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +vm-07-05: "min_train_micro_batch_size_per_gpu": 1, +vm-07-05: "num_tuning_micro_batch_sizes": 3 +vm-07-05: } +vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] bfloat16_enabled ............. 
False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] checkpoint_parallel_write_pipeline False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] checkpoint_tag_validation_enabled True +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] checkpoint_tag_validation_fail False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] comms_config ................. +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] communication_data_type ...... None +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] curriculum_enabled_legacy .... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] curriculum_params_legacy ..... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] data_efficiency_config ....... 
{'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] data_efficiency_enabled ...... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] dataloader_drop_last ......... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] disable_allgather ............ False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] dump_state ................... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] dynamic_loss_scale_args ...... {'init_scale': 2048, 'scale_window': 500, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1} +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_enabled ........... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_gas_boundary_resolution 1 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_layer_name ........ bert.encoder.layer +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_layer_num ......... 0 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_max_iter .......... 100 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_stability ......... 1e-06 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_tol ............... 0.01 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_verbose ........... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] elasticity_enabled ........... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] flops_profiler_config ........ 
{ +vm-07-05: "enabled": false, +vm-07-05: "recompute_fwd_factor": 0.0, +vm-07-05: "profile_step": 1, +vm-07-05: "module_depth": -1, +vm-07-05: "top_modules": 1, +vm-07-05: "detailed": true, +vm-07-05: "output_file": null +vm-07-05: } +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] fp16_auto_cast ............... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] fp16_enabled ................. True +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] fp16_master_weights_and_gradients False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] global_rank .................. 0 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] grad_accum_dtype ............. None +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] gradient_accumulation_steps .. 64 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] gradient_clipping ............ 1.0 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] gradient_predivide_factor .... 1.0 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] initial_dynamic_scale ........ 2048 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] load_universal_checkpoint .... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] loss_scale ................... 0 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] memory_breakdown ............. False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] mics_hierarchial_params_gather False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] mics_shard_size .............. -1 +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] monitor_config ............... 
tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] nebula_config ................ { +vm-07-05: "enabled": false, +vm-07-05: "persistent_storage_path": null, +vm-07-05: "persistent_time_interval": 100, +vm-07-05: "num_of_version_in_retention": 2, +vm-07-05: "enable_nebula_load": true, +vm-07-05: "load_path": null +vm-07-05: } +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] optimizer_legacy_fusion ...... False +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] optimizer_name ............... None +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] optimizer_params ............. None +vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] pld_enabled .................. False +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] pld_params ................... False +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] prescale_gradients ........... False +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] scheduler_name ............... None +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] scheduler_params ............. None +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] seq_parallel_communication_data_type torch.float32 +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] sparse_attention ............. None +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] sparse_gradients_enabled ..... 
False +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] steps_per_print .............. 1 +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] train_batch_size ............. 2048 +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] train_micro_batch_size_per_gpu 2 +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] use_node_local_storage ....... False +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] wall_clock_breakdown ......... False +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] weight_quantization_config ... None +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] world_size ................... 16 +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] zero_allow_untested_optimizer False +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] zero_enabled ................. 
True +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] zero_force_ds_cpu_optimizer .. True +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] zero_optimization_stage ...... 1 +vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:964:print_user_config] json = { +vm-07-05: "train_batch_size": 2.048000e+03, +vm-07-05: "train_micro_batch_size_per_gpu": 2, +vm-07-05: "steps_per_print": 1, +vm-07-05: "zero_optimization": { +vm-07-05: "stage": 1 +vm-07-05: }, +vm-07-05: "gradient_clipping": 1.0, +vm-07-05: "prescale_gradients": false, +vm-07-05: "fp16": { +vm-07-05: "enabled": true, +vm-07-05: "loss_scale": 0, +vm-07-05: "loss_scale_window": 500, +vm-07-05: "hysteresis": 2, +vm-07-05: "min_loss_scale": 1, +vm-07-05: "initial_scale_power": 11 +vm-07-05: }, +vm-07-05: "wall_clock_breakdown": false +vm-07-05: } +vm-07-05: [2023-11-29 08:51:14,000] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-05: [2023-11-29 08:51:14,000] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-05: [2023-11-29 08:51:14,000] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
+vm-07-05: [2023-11-29 08:51:14,000] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-05: [2023-11-29 08:51:14,000] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-05: [2023-11-29 08:51:14,001] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-14: [2023-11-29 08:51:13,999] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-05: WARNING: could not find the metadata file .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase +vm-07-14: [2023-11-29 08:51:13,999] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
+vm-07-05: will not load any checkpoints and will start from random +vm-07-14: [2023-11-29 08:51:13,999] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-05: [2023-11-29 08:51:14,001] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-05: [2023-11-29 08:51:14,001] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-14: [2023-11-29 08:51:14,013] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-14: [2023-11-29 08:51:14,013] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 
+vm-07-14: [2023-11-29 08:51:14,013] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-14: [2023-11-29 08:51:14,014] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-14: [2023-11-29 08:51:14,014] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +vm-07-05: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-11-29 08:51:14 +vm-07-14: (min, max) time across ranks (ms): +vm-07-14: load-checkpoint ................................: (1.15, 16.61) +vm-07-05: > building train, validation, and test datasets ... +vm-07-05: > datasets target sizes (minimum size): +vm-07-05: train: 10240 +vm-07-05: validation: 20480 +vm-07-05: test: 20480 +vm-07-05: > building train, validation, and test datasets for GPT ... +vm-07-05: Single data path provided for train, valid & test +vm-07-05: > building dataset index ... +vm-07-05: reading sizes... +vm-07-05: reading pointers... +vm-07-05: reading document index... +vm-07-05: creating numpy buffer of mmap... +vm-07-05: creating memory view of numpy buffer... 
+vm-07-05: > finished creating indexed dataset in 0.000362 seconds +vm-07-05: number of documents: 115876 +vm-07-05: > dataset split: +vm-07-05: train: +vm-07-05: document indices in [0, 109966) total of 109966 documents +vm-07-05: validation: +vm-07-05: document indices in [109966, 115760) total of 5794 documents +vm-07-05: test: +vm-07-05: document indices in [115760, 115876) total of 116 documents +vm-07-05: > loading doc-idx mapping from /root/index-cache/06115e84e99e3b6ca4187cde686826c9_doc_idx.npy +vm-07-05: > loading sample-idx mapping from /root/index-cache/06115e84e99e3b6ca4187cde686826c9_sample_idx.npy +vm-07-05: > loading shuffle-idx mapping from /root/index-cache/06115e84e99e3b6ca4187cde686826c9_shuffle_idx.npy +vm-07-05: loaded indexed file in 0.001 seconds +vm-07-05: total number of samples: 10575 +vm-07-05: total number of epochs: 10 +vm-07-05: > loading doc-idx mapping from /root/index-cache/c9410219284a5371a54555ffc4190827_doc_idx.npy +vm-07-05: > loading sample-idx mapping from /root/index-cache/c9410219284a5371a54555ffc4190827_sample_idx.npy +vm-07-05: > loading shuffle-idx mapping from /root/index-cache/c9410219284a5371a54555ffc4190827_shuffle_idx.npy +vm-07-05: loaded indexed file in 0.001 seconds +vm-07-05: total number of samples: 20485 +vm-07-05: total number of epochs: 530 +vm-07-05: > loading doc-idx mapping from /root/index-cache/087bcc1d515023256208907c78e6a640_doc_idx.npy +vm-07-05: > loading sample-idx mapping from /root/index-cache/087bcc1d515023256208907c78e6a640_sample_idx.npy +vm-07-05: > loading shuffle-idx mapping from /root/index-cache/087bcc1d515023256208907c78e6a640_shuffle_idx.npy +vm-07-05: loaded indexed file in 0.001 seconds +vm-07-05: total number of samples: 20481 +vm-07-05: total number of epochs: 22018 +vm-07-05: > finished creating GPT datasets ... +vm-07-05: [after dataloaders are built] datetime: 2023-11-29 08:51:14 +vm-07-05: done with setup ... +vm-07-05: training ... 
+vm-07-14: (min, max) time across ranks (ms): +vm-07-14: model-and-optimizer-setup ......................: (16584.81, 16588.33) +vm-07-14: train/valid/test-data-iterators-setup ..........: (298.83, 342.60) +vm-07-05: [before the start of training step] datetime: 2023-11-29 08:51:14 +vm-07-05: [2023-11-29 08:52:27,711] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[1.2000000000000002e-07, 1.2000000000000002e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +vm-07-14: iteration 1/ 5 | consumed samples: 2048 | consumed tokens: 4194304 | elapsed time per iteration (ms): 76015.3 | learning rate: 1.200E-07 | global batch size: 2048 | lm loss: 1.100492E+01 | loss scale: 2048.0 | actual seqlen: 2048 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 26.942 | TFLOPs: 147.61 | +vm-07-05: [Rank 0] (after 1 iterations) memory (MB) | allocated: 17956.4931640625 | max allocated: 68581.8095703125 | reserved: 90178.0 | max reserved: 90178.0 +vm-07-05: [2023-11-29 08:53:42,835] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[2.4000000000000003e-07, 2.4000000000000003e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +vm-07-14: iteration 2/ 5 | consumed samples: 4096 | consumed tokens: 8388608 | elapsed time per iteration (ms): 75254.0 | learning rate: 2.400E-07 | global batch size: 2048 | lm loss: 1.100421E+01 | loss scale: 2048.0 | actual seqlen: 2048 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 27.215 | TFLOPs: 149.10 | +vm-07-05: [2023-11-29 08:54:58,114] [INFO] [logging.py:96:log_dist] [Rank 0] step=3, skipped=0, lr=[3.6000000000000005e-07, 3.6000000000000005e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +vm-07-05: [2023-11-29 08:55:00,932] [INFO] [timer.py:260:stop] epoch=0/micro_step=3/global_step=3, RunningAvgSamplesPerSec=291.6169137604138, CurrSamplesPerSec=291.6169137604138, MemAllocated=17.54GB, MaxMemAllocated=66.97GB +vm-07-14: iteration 3/ 5 | consumed samples: 6144 | consumed tokens: 12582912 | 
elapsed time per iteration (ms): 75249.6 | learning rate: 3.600E-07 | global batch size: 2048 | lm loss: 1.096316E+01 | loss scale: 2048.0 | actual seqlen: 2048 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 27.216 | TFLOPs: 149.11 | +vm-07-05: [2023-11-29 08:56:13,554] [INFO] [logging.py:96:log_dist] [Rank 0] step=4, skipped=0, lr=[4.800000000000001e-07, 4.800000000000001e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +vm-07-05: [2023-11-29 08:56:16,103] [INFO] [timer.py:260:stop] epoch=0/micro_step=4/global_step=4, RunningAvgSamplesPerSec=303.1183419363961, CurrSamplesPerSec=315.5642580603495, MemAllocated=17.54GB, MaxMemAllocated=66.97GB +vm-07-14: iteration 4/ 5 | consumed samples: 8192 | consumed tokens: 16777216 | elapsed time per iteration (ms): 75182.4 | learning rate: 4.800E-07 | global batch size: 2048 | lm loss: 1.055474E+01 | loss scale: 2048.0 | actual seqlen: 2048 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 27.240 | TFLOPs: 149.24 | +vm-07-05: [2023-11-29 08:57:28,159] [INFO] [logging.py:96:log_dist] [Rank 0] step=5, skipped=0, lr=[6.000000000000001e-07, 6.000000000000001e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +vm-07-05: [2023-11-29 08:57:30,611] [INFO] [timer.py:260:stop] epoch=0/micro_step=5/global_step=5, RunningAvgSamplesPerSec=311.41529524062446, CurrSamplesPerSec=329.45074244342015, MemAllocated=17.54GB, MaxMemAllocated=66.97GB +vm-07-14: iteration 5/ 5 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (ms): 74494.9 | learning rate: 6.000E-07 | global batch size: 2048 | lm loss: 9.927882E+00 | loss scale: 2048.0 | actual seqlen: 2048 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 27.492 | TFLOPs: 150.62 | +vm-07-05: [after training is done] datetime: 2023-11-29 08:57:30 +vm-07-05: saving checkpoint at iteration 5 to .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase 
+vm-07-05: [2023-11-29 08:57:32,086] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48040 +vm-07-05: [2023-11-29 08:57:32,114] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48041 +vm-07-05: [2023-11-29 08:57:32,248] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48042 +vm-07-14: [2023-11-29 08:57:32,266] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49031 +vm-07-14: [2023-11-29 08:57:32,269] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49032 +vm-07-14: [2023-11-29 08:57:32,270] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49033 +vm-07-14: [2023-11-29 08:57:32,272] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49034 +vm-07-14: [2023-11-29 08:57:32,272] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49035 +vm-07-14: [2023-11-29 08:57:32,273] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49036 +vm-07-14: [2023-11-29 08:57:32,275] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49037 +vm-07-05: [2023-11-29 08:57:32,302] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48043 +vm-07-05: [2023-11-29 08:57:32,303] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48044 +vm-07-05: [2023-11-29 08:57:32,305] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48045 +vm-07-14: [2023-11-29 08:57:32,303] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49038 +vm-07-14: [2023-11-29 08:57:32,304] [ERROR] [launch.py:321:sigkill_handler] ['/opt/conda/envs/py_3.9/bin/python', '-u', '/root/Megatron-DeepSpeed/examples_deepspeed/rebase/../../pretrain_gpt.py', '--local_rank=7', '--override-opt_param-scheduler', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--tensor-model-parallel-size', '1', '--init-method-std', '0.009', '--lr-decay-samples', '43945312', '--lr-warmup-samples', '2048000', '--lr-decay-style', 'cosine', '--micro-batch-size', '2', '--exit-duration-in-mins', '30000000', '--global-batch-size', '2048', '--num-layers', '32', 
'--hidden-size', '4096', '--num-attention-heads', '32', '--seq-length', '2048', '--max-position-embeddings', '2048', '--train-tokens', '300000000000', '--train-samples', '10240', '--lr', '1.2e-4', '--min-lr', '1.0e-6', '--split', '949,50,1', '--log-interval', '1', '--eval-interval', '500', '--eval-iters', '10', '--save-interval', '10000', '--weight-decay', '0.1', '--clip-grad', '1.0', '--hysteresis', '2', '--num-workers', '2', '--attention-dropout', '0.0', '--hidden-dropout', '0.0', '--optimizer', 'adam', '--use-distributed-optimizer', '--sequence-parallel', '--fp16', '--seed', '1234', '--load', './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase', '--save', './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase', '--no-async-tensor-model-parallel-allreduce', '--use-rotary-position-embeddings', '--no-gradient-accumulation-fusion', '--vocab-file', 'gpt2-vocab.json', '--merge-file', 'gpt2-merges.txt', '--data-path', '/root//dataset_text_sentence', '--data-impl', 'mmap', '--deepspeed', '--deepspeed_config', 'ds_config_gbs2048_mbs2_log1_zero1.json', '--zero-stage', '1', '--pipeline-model-parallel-size', '1', '--no-pipeline-parallel'] exits with return code = 1 +vm-07-05: [2023-11-29 08:57:32,306] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48046 +vm-07-05: [2023-11-29 08:57:32,306] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48047 +vm-07-05: [2023-11-29 08:57:32,308] [ERROR] [launch.py:321:sigkill_handler] ['/opt/conda/envs/py_3.9/bin/python', '-u', '/root/Megatron-DeepSpeed/examples_deepspeed/rebase/../../pretrain_gpt.py', '--local_rank=7', '--override-opt_param-scheduler', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--tensor-model-parallel-size', '1', '--init-method-std', '0.009', '--lr-decay-samples', '43945312', '--lr-warmup-samples', '2048000', '--lr-decay-style', 'cosine', '--micro-batch-size', '2', '--exit-duration-in-mins', '30000000', 
'--global-batch-size', '2048', '--num-layers', '32', '--hidden-size', '4096', '--num-attention-heads', '32', '--seq-length', '2048', '--max-position-embeddings', '2048', '--train-tokens', '300000000000', '--train-samples', '10240', '--lr', '1.2e-4', '--min-lr', '1.0e-6', '--split', '949,50,1', '--log-interval', '1', '--eval-interval', '500', '--eval-iters', '10', '--save-interval', '10000', '--weight-decay', '0.1', '--clip-grad', '1.0', '--hysteresis', '2', '--num-workers', '2', '--attention-dropout', '0.0', '--hidden-dropout', '0.0', '--optimizer', 'adam', '--use-distributed-optimizer', '--sequence-parallel', '--fp16', '--seed', '1234', '--load', './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase', '--save', './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase', '--no-async-tensor-model-parallel-allreduce', '--use-rotary-position-embeddings', '--no-gradient-accumulation-fusion', '--vocab-file', 'gpt2-vocab.json', '--merge-file', 'gpt2-merges.txt', '--data-path', '/root//dataset_text_sentence', '--data-impl', 'mmap', '--deepspeed', '--deepspeed_config', 'ds_config_gbs2048_mbs2_log1_zero1.json', '--zero-stage', '1', '--pipeline-model-parallel-size', '1', '--no-pipeline-parallel'] exits with return code = 1 diff --git a/third_party/Makefile b/third_party/Makefile index b3ca1d1f1..660878b48 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -11,12 +11,12 @@ HPCX_HOME ?= /opt/hpcx CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2) ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' 
-f1-3) -.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt +.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed # Build all targets. all: cuda rocm -cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn -rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt +cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed +rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed cpu: common cpu_perftest common: cpu_hpl cpu_stream fio directx_amd: directx_amf_encoding_latency @@ -171,3 +171,20 @@ directx_amf_encoding_latency: del vs_buildtools.exe && echo "Deleted vs_buildtools.exe" && \ "C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \ ) + +# Install Megatron-LM +megatron_lm: + if [ ! -d "Megatron/Megatron-LM" ]; then \ + git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"; \ + fi + cd Megatron && \ + python -m pip install -r requirements.txt + +# Install Megatron-DeepSpeed +megatron_deepspeed: + if [ ! 
-d "Megatron/Megatron-DeepSpeed" ]; then \ + git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"; \ + fi + cd Megatron && \ + python -m pip install -r requirements.txt && \ + python -m pip install DeepSpeed diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt new file mode 100644 index 000000000..8b7723298 --- /dev/null +++ b/third_party/Megatron/requirements.txt @@ -0,0 +1,13 @@ +nltk +parameterized +pybind11 +regex +six +# versions from HF transformers +black==21.4b0 +isort>=5.5.4 +tqdm +sentencepiece +wandb +einops +typing_extensions==4.5.0